diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..d75bc0b258f13c6dc5afaa68d77c2f4d609784d6 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,321 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00052-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00012-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00023-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00024-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00044-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00013-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00037-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00024-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00049-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00035-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00043-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00004-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00028-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00016-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00036-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00007-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00045-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00010-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00041-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00012-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00023-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00018-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00006-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00013-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00037-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00014-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00050-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00049-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00043-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00007-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00042-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00041-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00003-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00039-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00022-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00002-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00032-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00001-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00028-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00025-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00048-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00023-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00010-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00034-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00040-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00052-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00011-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00029-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00013-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00037-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00006-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00024-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00008-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00049-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00035-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00037-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00053-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00042-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00010-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00032-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00044-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00042-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00030-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00003-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00022-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00022-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00046-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00028-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00033-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00018-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00051-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00002-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00014-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00036-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00053-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00004-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00024-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00049-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00044-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00049-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00040-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00016-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00036-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00025-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00004-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00028-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00027-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00001-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00009-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00033-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00005-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00013-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00047-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00027-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00051-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00031-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00012-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00047-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00050-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00015-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00005-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00015-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00030-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00020-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00021-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00011-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00018-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00052-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00008-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00016-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00021-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00052-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00002-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00006-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00004-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00053-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00043-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00040-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00009-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00033-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00001-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00029-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00034-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00039-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00027-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00051-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00047-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00009-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00031-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00020-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00045-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00050-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00019-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00048-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00046-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00048-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00011-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00019-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00035-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00029-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00034-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00039-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00042-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00001-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00010-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00031-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00020-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00022-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00047-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00030-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00005-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00015-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00030-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00046-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00046-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00050-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00046-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00019-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00011-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00033-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00008-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00016-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00021-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00023-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00014-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00036-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00011-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00029-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00034-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00040-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00008-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00053-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00025-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00037-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00053-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00040-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00053-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00029-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00034-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00039-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00009-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00033-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00020-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00031-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00039-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00027-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00051-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00003-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00046-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00005-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00015-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00030-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00001-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00002-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00032-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00019-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00025-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00008-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00016-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00021-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00018-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00014-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00036-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00018-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00002-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00041-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00050-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00004-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00014-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00050-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00025-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00007-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00041-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00032-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00044-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00003-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00045-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00009-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00031-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00022-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00019-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00048-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00035-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00007-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00045-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00048-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00052-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00012-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00023-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00052-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00010-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00006-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00024-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00043-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00006-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00043-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00035-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00049-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00005-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00032-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00044-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00041-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00013-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00027-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00042-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00003-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00012-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00020-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00021-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00047-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00015-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00047-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00028-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-3T/pytorch_model-00051-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00048-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00007-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00045-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-1T/pytorch_model-00051-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00017-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00038-of-00053.bin filter=lfs diff=lfs merge=lfs -text
+model_hubs/Skywork-13B-Base-2T/pytorch_model-00026-of-00053.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/model_hubs/Skywork-13B-Base-0.5T/config.json b/model_hubs/Skywork-13B-Base-0.5T/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..176a4ca6fc2d7e436819a6c762c7967edb3a7b3f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "SkyworkForCausalLM"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "hidden_act": "silu",
+ "hidden_size": 4608,
+ "initializer_range": 0.01,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 131072,
+ "model_type": "skywork",
+ "num_attention_heads": 36,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 36,
+ "rms_norm_eps": 1e-06,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.1",
+ "use_cache": true,
+ "vocab_size": 65519
+ }
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-0.5T/configuration_skywork.py b/model_hubs/Skywork-13B-Base-0.5T/configuration_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbad8ae1e08d431a14c5de719267629feb4cd5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/configuration_skywork.py
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SKYWORK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+ model_type = "skywork"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
diff --git a/model_hubs/Skywork-13B-Base-0.5T/generation_config.json b/model_hubs/Skywork-13B-Base-0.5T/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aece903f676603332b5bc1b1a29d6e44a8c02464
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/generation_config.json
@@ -0,0 +1,10 @@
+{
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.33.1"
+}
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-0.5T/modeling_skywork.py b/model_hubs/Skywork-13B-Base-0.5T/modeling_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d2898e0e7d379dc6883c4e34043e537689b8bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/modeling_skywork.py
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
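+    # Prepend a block of zeros over the cached positions so new queries can always attend to past key/value entries.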
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
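+    # Padding positions (0 in the original mask) are filled with the dtype's most negative value so they are ignored after softmax.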
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class SkyworkRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ SkyworkRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
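+        # Normalize in float32 for numerical stability, then cast back to the
+        # input dtype before applying the learned scale.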
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
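+    # Precompute cos/sin tables up to `seq_len`; forward() rebuilds the cache if a longer sequence arrives.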
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # The layout differs from the paper, but this permutation yields the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # The layout differs from the paper, but this permutation yields the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
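+        # Dynamic NTK: once the sequence exceeds the trained context length, recompute the rotary base so the frequencies stretch in proportion to the overflow.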
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # The layout differs from the paper, but this permutation yields the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
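+        # NTK scaling: enlarging the rotary base stretches the wavelength of every frequency, extending the usable context length.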
+ self.base = base * scaling_factor
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class SkyworkMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ if self.config.pretraining_tp > 1:
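+            # Reproduce the tensor-parallel computation used during pretraining:
+            # split each projection weight into `pretraining_tp` shards, apply them
+            # separately, then recombine the partial results.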
+            slice_size = self.intermediate_size // self.config.pretraining_tp
+            gate_proj_slices = self.gate_proj.weight.split(slice_size, dim=0)
+            up_proj_slices = self.up_proj.weight.split(slice_size, dim=0)
+            down_proj_slices = self.down_proj.weight.split(slice_size, dim=1)
+
+            gate_proj = torch.cat(
+                [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+            )
+            up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice_size, dim=2)
+ down_proj = [
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+ ]
+ down_proj = sum(down_proj)
+ else:
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class SkyworkAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = SkyworkRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "ntk":
+ self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+ print('-'*80)
+ print(f"USING COSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
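+            # Tensor-parallel path: shard the q/k/v projection weights into
+            # `pretraining_tp` slices and concatenate the per-shard outputs.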
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class SkyworkDecoderLayer(nn.Module):
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = SkyworkAttention(config=config)
+ self.mlp = SkyworkMLP(config)
+ self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+ config_class = SkyworkConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SkyworkDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, SkyworkModel):
+ module.gradient_checkpointing = value
+
+class SkyworkModel(SkyworkPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+ Args:
+ config: SkyworkConfig
+ """
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
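+ # The 2D padding mask is expanded to [bsz, 1, tgt_len, src_len] and added to the causal
+ # mask, so masked positions carry large negative values that vanish after the softmax.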
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
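+ # When a cache is present, positions start at past_key_values_length so that the rotary
+ # embeddings stay aligned with the tokens that were already processed.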
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # past_key_value (not cached during checkpointed training) and output_attentions are closed over here
+ return module(*inputs, past_key_value, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = SkyworkModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
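+ # With pretraining_tp > 1 the LM head is applied in vocab-dimension slices and the
+ # partial logits are concatenated back along the last dimension.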
+ if self.config.pretraining_tp > 1:
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+ logits = torch.cat(logits, dim=-1)
+ else:
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
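+ # During incremental decoding only the newest token is fed to the model; earlier positions
+ # are served from past_key_values. position_ids are rebuilt from the attention mask so that
+ # left-padded batches keep consistent positions.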
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = SkyworkModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
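+ # Pool the logits of the last non-padding token in each sequence (or the final position
+ # when no pad token is defined).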
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
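+ # Infer the problem type (regression / single-label / multi-label) from num_labels and
+ # the label dtype when it is not set in the config.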
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
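+
+
+ # Illustrative usage sketch (kept as comments; not executed as part of this module).
+ # It assumes a local sharded checkpoint such as model_hubs/Skywork-13B-Base-0.5T together
+ # with a compatible tokenizer; the path and generation settings below are examples only.
+ #
+ #   from transformers import AutoModelForCausalLM, AutoTokenizer
+ #   path = "model_hubs/Skywork-13B-Base-0.5T"
+ #   tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+ #   model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)
+ #   inputs = tokenizer("The capital of France is", return_tensors="pt")
+ #   output_ids = model.generate(**inputs, max_new_tokens=16)
+ #   print(tokenizer.decode(output_ids[0], skip_special_tokens=True))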
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00001-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00001-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..68e0ba481871914e25384a4d60be96ab901cce89
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00001-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a31f4919c53258dab61a55eda4e1a3a674cedff77566ce38de888ba35893503
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00002-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00002-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d9825b28adaf763ba80e9a5b4c8bd3a60e0363b6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00002-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4c455a54fc166e04f5f2783826ef3f7b74beab514fcc91ab2f04efde7067ced
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00003-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00003-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..db5572e995be1171b13c52eecad202301ae090ff
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00003-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1683e159abd4415ab7adc3e05d4c538fcaa7a6a9fe486022ab4bd7da51e3258e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00004-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00004-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5a8af62e7c6aba8e72e4fadd94b1bd8635f74b52
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00004-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1a27072ccdd11b982c8c0f858059df1f0b37fd592b9eafef3b15febb36ac22d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00005-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00005-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e6b23e75602a4e0bede449e066fdafb499d13a62
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00005-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5bfdbc9db75550a86acc6b1ca051067467fdde8e59f31f6782d0fbe4dca7be6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00006-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00006-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..359a3067f7218a5ade3d3707bd61efccc8450d46
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00006-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36f1f2fc97ca7478dac47d8c29f3a1b792153916061a2ea4493bb934fdf6892c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00007-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00007-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c8be85295c58133e3739fc41bd8b00b2aae68fca
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00007-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4baf264b16adf4f8a9dfd28b2ae470d9801726423d20ebcb40240a28eb53e3
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00008-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00008-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f415c9615398e467660b0d831d2f1b9f7a8a576e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00008-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da767d2df57e3372bc4f266de1f495da0164193a8baaa1a840bbd0fd77f12700
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00009-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00009-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..79c6cfb8cf4c51501dd0db0dc75d7bb08aaea994
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00009-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8472141a098d05382493adb5b318a3811fc6fb3b284e25b5e280c069f0c49ef4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00010-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00010-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..60f76760565a315f498a3412776f74aa5022d163
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00010-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d9fd7427fbe3372dece4622b32935122bba9681059571fd5fa15457f4b759e1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00011-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00011-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..24af9a8af0633b12d372e17ac55ec2b5ca441509
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00011-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f8a5543772f042997f1e953775a57773842f63c2f3bca7a9e8e1bc7e6dbdd73
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00012-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00012-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5230f688b66b536e90ce04fc9ceac9f3f334d2d8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00012-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c16c4018b58ab88967be5919676756d1f478ebb9cd3062d6c34c4cf049339b0d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00013-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00013-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6de2b9afd8f6ab4ac5515a8fbaf98f79f153fb15
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00013-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7e6e8e083df4e842b26d10324000e3f49ab4424778e214b7c2538476995732f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00014-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00014-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f20d93bd93e57dcc53bcf81aa49ea7efbe415c9b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00014-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88a60960cac6eed2cff625246d6c573da9bf5ce23165aadf8d039b141f3f8c9d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00015-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00015-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0f646beef08455b4fa16f8d4673617790dee05ba
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00015-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5198b223e32e63633bc648f504f6cfc8d84fb5675eaf2bdc537c94d8fd02e2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00016-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00016-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2e1738555dd4cdc595d358b18ed01c5f0d575118
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00016-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9318391d7bb59fa558363226c166d11908a8b6ec9533c290e92d927cb97f1224
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00017-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00017-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4536041008eb1d637b9addd08c7739be9b8719f1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00017-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ee6f17f62b955c649e57dba17efe24d823871da85e2d3a6b88b16fd0e20501c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00018-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00018-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..24c9607bfda21578b827308219a2c45c66301e43
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00018-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a3c80de02dee6e8e145397ab2f81128a45d0688520a94c80758a192dd0fdacd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00019-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00019-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1692d240a8a6245a30a5756263f0e69e9acbfe1a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00019-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dc0040360ed18afb1c062c744f62b275962d5f8eaebfaaec35bdfb8fa51d4b5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00020-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00020-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3290edef11ee3a00bc544222e99d99e07fd77c37
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00020-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:916a1b4af65c3c3a1d4b7bb1684eda39917c9007b07344ab20438d488ddc244d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00021-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00021-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bdc9183269d08e4d2bff9fa811aa6b25330b8bc7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00021-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:452e8ddc8ad0b4349cca9835c6097dacb7d088660f01681f2fbdfc1a986b871f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00022-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00022-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..224ff03a41e9ccb3d648d862622bb01b80f94383
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00022-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2074cf8594bc3000ac18254bdc3b9bb7acc0948dc5c1d45867431799ea686c8f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00023-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00023-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7a88db9b7003c152c7df280c9d01fdc90266d075
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00023-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e905d6e990ba090855e014f0c7a1ad6429a02633ac8e4aad5a58c8e4475e178
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00024-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00024-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d261ad0b8a24c65090c140b3686d1424d0ff3596
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00024-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61122a555b5fbe8146a5774652a3eb066b3571d01cec5067d51cb238d0b7dfae
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00025-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00025-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1ba0297c65da2ba6c451f336788cb6a3e62d8cd5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00025-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b272175eab7d652bf1acb048c4142f86d8ca2678f4e1e99aeacaa698f18e9532
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00026-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00026-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9d8618e489c1085a827484e210e836e0c2f29528
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00026-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2e954204f739f32f79a2bdb5ae88d219ddc243f89070b283bca7a6856f87656
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00027-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00027-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..340428e370f3e6e01df811dc410208f2cdcde637
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00027-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2d05d92bfad6ae69d5b186596adab0b4b3058cff074fca1793cbd64946a6679
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00028-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00028-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..32ecb1276a2e370b3145b1938b1d6ebe60e331a2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00028-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d7eb154702b0ff0cf5909e53463e67cbce2ffafbd2fe7db91c8b4864651a474
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00029-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00029-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3c1d9762282c53889e5dad8675d985639bb9ffd9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00029-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bac87079dcf5502ec351036692cf5986683795ad854cdfa4eeed4083d5b0405d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00030-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00030-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..85eddb81b9a08390d7a8e0b9104d0a7e8167c6fe
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00030-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:431985bf4b8d0720d7ec1659ad5d3bd595f97e127ecacf09dd88692f6ddd4ec1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00031-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00031-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3c6b30e2f4435c5627fda669af939092c803dc36
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00031-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a60a9a3101eb9cbbf9a0042c092c4975feb1a3ac181e9ff30f91422344c1cd10
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00032-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00032-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e1d904db1e61e20a5bd76c592e43b06f876b1875
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00032-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2694f89889cee13a04beff28699bb14b44091b4e9f09f36fb9d50653eddb75c3
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00033-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00033-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..18f96769992602c7702ce3daa55dcacd2085de23
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00033-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fe1abccae6a7536f40a0be9f355bffcfa45f8d655db8101c55a5d39e3c1ca1f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00034-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00034-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3f4d4bc8ebf162dc6e653c08932a705e5c718a8d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00034-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bc82b69f061a812bf8f43058d6f98cbaf6657350bf3449a2720c73a506a630f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00035-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00035-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f538a09ba8b4b15e7b271355405b733b92502d68
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00035-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e7d9bcec5917264f51ab6ef8d13d8e98e4fab1fd6a5eacaeeb50e671bae6d17
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00036-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00036-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fad940d74f9a26ef5ad1fadc0046903a0e661674
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00036-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a71d10be04bd0d72514b6d74a23e68d50bab61c4a277fa456a248124434aa794
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00037-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00037-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e5df2899adb52cb48a8b9d409a2300d6dc325bc3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00037-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0aa975c5b15fda690656174ca0ede406d3df25727d8e1159bc8ce73ae2158475
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00038-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00038-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..118ad760db2712d7691d3e35f4b37eabc127a079
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00038-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ecb76118ddfc0e428aa5479ecb34fdc46eedefd0da653494f1ca6d10ef980a4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00039-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00039-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..df796f8b8bdd7b209163cc11a202a19f41f259d6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00039-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03ba83e021307ea6ab07bc6055e15b376a5b0419c0999f85995447d93c756a47
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00040-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00040-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fb41c0222164a9041101e490232e2087d0649d65
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00040-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ac254c4b4cb94906820e772b65e3796238625e019bccd145d863843449a0d2f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00041-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00041-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d2cfd6c42a9491c887f3d88b037d3a06c13658ec
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00041-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:968133952d5d3f517a8f58f615372573233eeb839ac98751d8065bd276acd2ee
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00042-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00042-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fae85ed1e52908a73d57c227e19a7656a21d92ec
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00042-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a7e38448cd38954248cc60f728324baa889539621c1b606d2149f6708548d5e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00043-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00043-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..beab2fc441a48030a188de9e1f99826d2ceac1d7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00043-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d6c91e4bfe5d715901fe0c1c7240b635a20cdde683c998ce0acfcf79ea3ae51
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00044-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00044-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5a6e58201d726c786031bf9cf6cb9f289c376e1f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00044-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78152fcbc76144402e87d3ca295029e5ae93ed66a2e435a9ed3e2e046cbf9b1d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00045-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00045-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b54f17ad2cea4056748f1455b872bf35a3dd672
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00045-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf5cbfe833f501565023fcbf877bc1377e67939c8196656c5a37f7a56102d35d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00046-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00046-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ac4d77fd0047f6757666edfac604141d4a154bee
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00046-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa53fb07d9fd3df5c3eeeb96f6e4663d5297102450f7f6501de90af186026b0a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00047-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00047-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c01e5bbefa0e09e4469833971296ec35f35bf8be
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00047-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bcd0401351f01d08c3058eb2d878652a8547647c3e901edda922b1e8c7bb5f2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00048-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00048-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f5d08fec021446f3a698e66cf0e41fe0b3152a4a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00048-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0f6c029d1eefd2290ff71f024fde4a3206fe17a209a01e93e6a8f96045a7364
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00049-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00049-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b86a18802325d174102d23fb0d9e620bdf5f9905
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00049-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf0e3d61ebe329eaf12c0153839436c9cc058cf75f080574af99452e09169f88
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00050-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00050-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f491a2aaadd608884d11f20697a3ead01ef8750c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00050-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c459343967a6e16156b848b3304bd47f7c2f25aceb41b6534696b3ea6274f0c0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00051-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00051-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d9cb8014f8c1ec827e2fa7e50a30d8f40e693b96
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00051-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31ce5582a9ff29938edf10d10ff6d5fdea1990d0942b912a0ac644b5a9e14015
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00052-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00052-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..207cefb869e27c4a7755e1add29cf61d200bb5fa
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00052-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6661c31c9a3d1bec1c893709f3edeba9e8f56441fa571f588e455b354a7ec6fc
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00053-of-00053.bin b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00053-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03cd7be951b2a7ba0b36f1343825bdfc3729d7ed
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model-00053-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:732eb499239c716bef18da7f4015d666ed1f3a75184a3133e32ad483bc189c66
+size 1207656908
diff --git a/model_hubs/Skywork-13B-Base-0.5T/pytorch_model.bin.index.json b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..163c37a78b34efe7cc858ea3fdca93e4c7c25699
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/pytorch_model.bin.index.json
@@ -0,0 +1 @@
+{"metadata": {"total_size": 27708239872}, "weight_map": {"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.2.input_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.up_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.down_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.3.input_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.up_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.down_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.4.input_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.v_proj.weight": 
"pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.up_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.down_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.5.input_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.up_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.down_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.6.input_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.up_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.down_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.7.input_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.up_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.down_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.8.input_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.up_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.down_proj.weight": "pytorch_model-00009-of-00053.bin", 
"model.layers.9.input_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.up_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.down_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.10.input_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.up_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.down_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.11.input_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.up_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.down_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.12.input_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.up_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.down_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.13.input_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.v_proj.weight": 
"pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.up_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.down_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.14.input_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.up_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.down_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.15.input_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.up_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.down_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.16.input_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.up_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.down_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.17.input_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.up_proj.weight": "pytorch_model-00018-of-00053.bin", 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.18.input_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.up_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.down_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.19.input_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.up_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.down_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.20.input_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.up_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.down_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.21.input_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.up_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.down_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.22.input_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.k_proj.weight": 
"pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.up_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.down_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.23.input_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.up_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.down_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.24.input_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.up_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.down_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.25.input_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.up_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.down_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.26.input_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00027-of-00053.bin", 
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.down_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.27.input_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.up_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.down_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.28.input_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.up_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.down_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.29.input_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.up_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.down_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.30.input_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.up_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.down_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.31.input_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.q_proj.weight": 
"pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.up_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.down_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.32.input_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.up_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.down_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.33.input_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.up_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.down_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.34.input_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.up_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.down_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.35.input_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00036-of-00053.bin", 
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.up_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.down_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.36.input_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.up_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.down_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.37.input_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.up_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.down_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.38.input_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.up_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.down_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.39.input_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.up_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.down_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.40.input_layernorm.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.post_attention_layernorm.weight": 
"pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.rotary_emb.inv_freq": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.up_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.down_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.41.input_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.rotary_emb.inv_freq": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.up_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.down_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.42.input_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.rotary_emb.inv_freq": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.up_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.down_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.43.input_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.rotary_emb.inv_freq": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.up_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.down_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.44.input_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00045-of-00053.bin", 
"model.layers.44.self_attn.rotary_emb.inv_freq": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.up_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.down_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.45.input_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.rotary_emb.inv_freq": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.up_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.down_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.46.input_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.rotary_emb.inv_freq": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.up_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.down_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.47.input_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.rotary_emb.inv_freq": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.up_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.down_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.48.input_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.rotary_emb.inv_freq": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.up_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.down_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.49.input_layernorm.weight": 
"pytorch_model-00050-of-00053.bin", "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.rotary_emb.inv_freq": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.up_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.down_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.50.input_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.rotary_emb.inv_freq": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.up_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.down_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.51.input_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.rotary_emb.inv_freq": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.up_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.down_proj.weight": "pytorch_model-00052-of-00053.bin", "model.norm.weight": "pytorch_model-00053-of-00053.bin", "model.embed_tokens.weight": "pytorch_model-00053-of-00053.bin", "lm_head.weight": "pytorch_model-00053-of-00053.bin"}}
\ No newline at end of file
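
The mapping above is the `weight_map` of a standard Hugging Face sharded-checkpoint index: every parameter name points at one of the 53 shard files, and `from_pretrained` reads it to pull only the shards it needs. Below is a minimal sketch of consuming that index directly; the local checkpoint path is hypothetical and it assumes the index is stored under the usual `pytorch_model.bin.index.json` name with the LFS shards already pulled.

```python
import json
import os
from collections import defaultdict

# Hypothetical local clone of one of the checkpoints in this repo.
checkpoint_dir = "model_hubs/Skywork-13B-Base-0.5T"

with open(os.path.join(checkpoint_dir, "pytorch_model.bin.index.json")) as f:
    index = json.load(f)

weight_map = index["weight_map"]  # parameter name -> shard file name
print(weight_map["model.layers.13.self_attn.o_proj.weight"])
# -> "pytorch_model-00014-of-00053.bin"

# Group parameter names by shard so each .bin file would be opened only once.
params_per_shard = defaultdict(list)
for name, shard in weight_map.items():
    params_per_shard[shard].append(name)
print(len(params_per_shard))  # 53 shards
```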
diff --git a/model_hubs/Skywork-13B-Base-0.5T/special_tokens_map.json b/model_hubs/Skywork-13B-Base-0.5T/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..d85ba6cb6820b01226ef8bd40b46bb489041c6a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-0.5T/tokenization_skywork.py b/model_hubs/Skywork-13B-Base-0.5T/tokenization_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac378d77d2d90d17340b3cb8eaf91bdb1656b71d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/tokenization_skywork.py
@@ -0,0 +1,250 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+"""Tokenization classes for Skywork."""
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+if TYPE_CHECKING:
+ from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+B_INST, E_INST = "[INST]", "[/INST]"
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
+that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+class SkyworkTokenizer(PreTrainedTokenizer):
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="",
+ bos_token="",
+ eos_token="",
+ pad_token=None,
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ clean_up_tokenization_spaces=False,
+ legacy=True,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ self.legacy = legacy
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ legacy=legacy,
+ **kwargs,
+ )
+ if legacy:
+ logger.warning_once(
+ f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+ )
+
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+ def tokenize(self, text, **kwargs) -> List[str]:
+ # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+ # the beginning of the text
+ if not self.legacy:
+ text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+ return super().tokenize(text, **kwargs)
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+ def _tokenize(self, text):
+ if not self.legacy:
+ is_first = text.startswith(SPIECE_UNDERLINE)
+ if is_first:
+ text = text[1:]
+
+ tokens = self.sp_model.encode(text, out_type=str)
+
+ if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+ tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+ return tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (
+ bos_token_id
+ + ([0] * len(token_ids_0))
+ + eos_token_id
+ + bos_token_id
+ + ([0] * len(token_ids_1))
+ + eos_token_id
+ )
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+ if token_ids_1 is not None:
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+ return output
+
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+ dialogue = list(conversation.iter_texts())
+ if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+ [not is_user for is_user, msg in dialogue[1::2]]
+ ):
+ raise ValueError(
+ "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+ )
+
+ dialog_tokens: List[int] = []
+ if len(conversation.past_user_inputs) > 0:
+ if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+ conversation.past_user_inputs[0] = (
+ B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+ )
+ elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+ dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+ dialog_tokens += sum(
+ [
+ [self.bos_token_id]
+ + self.encode(
+ f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+ )
+ + [self.eos_token_id]
+ for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+ ],
+ [],
+ )
+ if not (dialogue[-1][0]):
+ raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
+ dialog_tokens += [self.bos_token_id] + self.encode(
+ f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+ )
+ return dialog_tokens
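
For reference, a minimal usage sketch of the tokenizer class added above. It assumes the snippet is run from inside the checkpoint directory (so `tokenization_skywork` is importable and `tokenizer.model` is present), that `sentencepiece` is installed, and that the LFS-tracked model file has been pulled; the prompt string is illustrative.

```python
from tokenization_skywork import SkyworkTokenizer  # the class defined above

tok = SkyworkTokenizer("tokenizer.model")

ids = tok("Skywork is a 13B base model.")["input_ids"]
print(ids[0] == tok.bos_token_id)          # True: add_bos_token defaults to True
print(ids[-1] == tok.eos_token_id)         # False: add_eos_token defaults to False
print(tok.convert_ids_to_tokens(ids[:5]))  # SentencePiece pieces, starting with the "▁" prefix
```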
diff --git a/model_hubs/Skywork-13B-Base-0.5T/tokenizer.model b/model_hubs/Skywork-13B-Base-0.5T/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..decbfe220922d6a38ff52541ef3927b97fb7893e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+size 994250
diff --git a/model_hubs/Skywork-13B-Base-0.5T/tokenizer_config.json b/model_hubs/Skywork-13B-Base-0.5T/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c232b8b78a3ad2ce894b9a17628f3821627ccd7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-0.5T/tokenizer_config.json
@@ -0,0 +1,40 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SkyworkTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_skywork.SkyworkTokenizer",
+ null
+ ]
+ }
+}
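
The `auto_map` entry above is what lets `AutoTokenizer` resolve the custom class shipped inside the checkpoint (the `null` second element means no fast tokenizer is provided). A minimal loading sketch; the local path is hypothetical, and `trust_remote_code=True` is required because the class lives in `tokenization_skywork.py` rather than in the transformers library.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "model_hubs/Skywork-13B-Base-0.5T",  # hypothetical local path or Hub repo id
    trust_remote_code=True,
    use_fast=False,  # only the slow SentencePiece tokenizer is shipped
)
print(type(tokenizer).__name__)  # SkyworkTokenizer
```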
diff --git a/model_hubs/Skywork-13B-Base-1.5T/config.json b/model_hubs/Skywork-13B-Base-1.5T/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..176a4ca6fc2d7e436819a6c762c7967edb3a7b3f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "SkyworkForCausalLM"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "hidden_act": "silu",
+ "hidden_size": 4608,
+ "initializer_range": 0.01,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 131072,
+ "model_type": "skywork",
+ "num_attention_heads": 36,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 36,
+ "rms_norm_eps": 1e-06,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.1",
+ "use_cache": true,
+ "vocab_size": 65519
+ }
\ No newline at end of file
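
A short sketch that loads the config above through its `auto_map` hook and checks a couple of derived quantities; the path is hypothetical, and `trust_remote_code=True` is needed because `model_type: skywork` is not registered in transformers itself.

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    "model_hubs/Skywork-13B-Base-1.5T", trust_remote_code=True
)
print(cfg.hidden_size // cfg.num_attention_heads)          # head_dim: 4608 / 36 = 128
print(cfg.num_key_value_heads == cfg.num_attention_heads)  # True: plain multi-head attention, no GQA
print(cfg.vocab_size, cfg.num_hidden_layers)               # 65519, 52
```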
diff --git a/model_hubs/Skywork-13B-Base-1.5T/configuration_skywork.py b/model_hubs/Skywork-13B-Base-1.5T/configuration_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbad8ae1e08d431a14c5de719267629feb4cd5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/configuration_skywork.py
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+ model_type = "skywork"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
diff --git a/model_hubs/Skywork-13B-Base-1.5T/generation_config.json b/model_hubs/Skywork-13B-Base-1.5T/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aece903f676603332b5bc1b1a29d6e44a8c02464
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/generation_config.json
@@ -0,0 +1,10 @@
+{
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.33.1"
+}
\ No newline at end of file
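
The defaults above (sampling with temperature 0.6, top_p 0.9, max_length 4096) are what `generate()` picks up automatically from `generation_config.json`. A minimal end-to-end sketch; it assumes the LFS shards for this checkpoint have been pulled, that the directory also contains the tokenizer files (as the 0.5T one does), that `accelerate` is installed for `device_map="auto"`, and that the local path is hypothetical.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "model_hubs/Skywork-13B-Base-1.5T"  # hypothetical local clone
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

inputs = tokenizer("The Skywork-13B base model was trained on", return_tensors="pt").to(model.device)
# do_sample / temperature / top_p are taken from the checkpoint's generation_config.json.
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```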
diff --git a/model_hubs/Skywork-13B-Base-1.5T/modeling_skywork.py b/model_hubs/Skywork-13B-Base-1.5T/modeling_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d2898e0e7d379dc6883c4e34043e537689b8bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/modeling_skywork.py
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class SkyworkRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ SkyworkRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base * scaling_factor
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class SkyworkMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ if self.config.pretraining_tp > 1:
+ slice = self.intermediate_size // self.config.pretraining_tp
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+ gate_proj = torch.cat(
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+ )
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+ down_proj = [
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+ ]
+ down_proj = sum(down_proj)
+ else:
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class SkyworkAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = SkyworkRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "ntk":
+ self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+ print('-'*80)
+ print(f"USING COSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class SkyworkDecoderLayer(nn.Module):
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = SkyworkAttention(config=config)
+ self.mlp = SkyworkMLP(config)
+ self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
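+ # pre-norm residual block: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add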
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+ config_class = SkyworkConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SkyworkDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, SkyworkModel):
+ module.gradient_checkpointing = value
+
+class SkyworkModel(SkyworkPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+ Args:
+ config: SkyworkConfig
+ """
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
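+ # merge the user-supplied padding mask with the causal mask; both are additive masks using large negative values at masked positions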
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
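+ # default position ids continue from past_key_values_length so absolute positions stay correct during cached generation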
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # forward the cached key/values (possibly None) and the output_attentions flag explicitly
+ return module(*inputs, past_key_value, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = SkyworkModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ if self.config.pretraining_tp > 1:
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+ logits = torch.cat(logits, dim=-1)
+ else:
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
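+ # rebuild position_ids from the attention mask so left-padded batches keep correct positions during generation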
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
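+ # for beam search: re-select each layer's cached key/value states according to the chosen beam indices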
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = SkyworkModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
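+ # pool the logits at each sequence's last non-padding token (falls back to the final position when no pad token is defined)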
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
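+
+ # ---------------------------------------------------------------------------
+ # Minimal usage sketch (illustrative only, not part of the modeling code
+ # itself). It assumes the checkpoint directory added in this commit also
+ # contains the config/tokenizer files required for `trust_remote_code`
+ # loading; adjust the path to whichever snapshot you want to load.
+ #
+ #   from transformers import AutoModelForCausalLM, AutoTokenizer
+ #
+ #   path = "model_hubs/Skywork-13B-Base-1.5T"
+ #   tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+ #   model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)
+ #
+ #   inputs = tokenizer("Hello, Skywork!", return_tensors="pt")
+ #   outputs = model.generate(**inputs, max_new_tokens=32)
+ #   print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ # ---------------------------------------------------------------------------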
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00001-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00001-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e1deed0459b5a5fc2d4104bfd32f9c2aee26afee
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00001-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9222a35f4b5c7a84bcdd18e08099620dccedc6de77d5da81d2dc304287ee58
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00002-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00002-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..14b26e0fd05b733aa5ba248733aa2a05624d9cd1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00002-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7374d4b58ad1013ff1df033cb984bd09850e33483686dcdbb91b0a07540db004
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00003-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00003-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..102beee331d0f10282bf4bcb637d66e551e36ca7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00003-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f7e5998a3681000e2475e477838a8d8b38b934dc7082182219dc157cbb49871
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00004-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00004-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ffaf66f76710f0f6180d096eb48371d8b1338bb6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00004-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acfad47261e35591279d2331db6c4ee23089b490c54a5352aa56f3c2918b2fa7
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00005-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00005-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0aaf290e727940635750a8342fd4355128198827
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00005-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e403f276c75717a36e35f0fdd5a377c6d4fcdf9420453d67705d1216c2f81ea0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00006-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00006-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6e65e7e016751be09a6068bc15d7499d56a99791
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00006-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1e36064758b011168d10d54fae2a11208a0ffa4910f89f8bdd09e503779466b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00007-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00007-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0dc5a29717d6c6d55f98df4297423453936a2de2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00007-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ec240616fdfe6aea4882e62f8138bcc7d969cefb44bf8bbc612f99e3d413293
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00008-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00008-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..58cc14e219b891c4a29cb5d754dad625d86c4742
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00008-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3dca5956b79be22a757ed10b6735a93f195df1ce4c05aa587ef28e94100fb99
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00009-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00009-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f4427b0763a2c01b2ccc2801ccd9e12eb9b1984e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00009-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:deac730eabf4c1ffcec17ed7ad3a30adbfe9f1d6849bab7c7ac549c3124bc692
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00010-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00010-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0c8668161a6df1c361fd691cc58920ff013ff7fd
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00010-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82700ed731ded20cca643e0e9b697eff18913c346d89d56b9e9eb85e10ff5d0f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00011-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00011-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..eb425f7bd84768807845bbd06d7c7a1962013189
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00011-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4ee8c2b37b4a4fcf5c59ef7b07cdea394ee06439081a5e44bb01d1177dade97
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00012-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00012-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a572c4a85054c45b196a8426ae25cdf46c24d321
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00012-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9593f2c31426f0356946f88a4b02e00c94d43c747e356965fdd4c182cb4fe7b1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00013-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00013-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..757000ce9792c5c8c1225a6740e97bc85ac4d785
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00013-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4f1ca9d5993d6a51a36f837381a40303b7924df09de47bddbf744aed19f41a2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00014-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00014-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..718f6b837f1aff48f7f9f1af34a946c905076f12
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00014-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44ba6a86fbb0f7e038664421497801cedbfc2e8c0d89f80e05b6d9b184e26abd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00015-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00015-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9ee0f906187a9e638126ebabe4f919d4f9aecf88
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00015-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abbc447878e9b1c55313c91a4fd689c576c45b603e70a82d8a49a791903daacd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00016-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00016-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8138b9c67c9e7dd24056563c4aa38c7554095046
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00016-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90a113664c6092ac83a2d3124cf7852acbdc97075d38cf075a746c34a5a37dbb
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00017-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00017-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..28d3ebe9ac1e98f0efc351b95531b1d84a49eb98
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00017-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba71a6b26543f229d6a50d7f7b99ea9346977a7d8b702553ed3c2290c89a4f85
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00018-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00018-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..27133a0b4a610582cee5356363db4e16075f36d1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00018-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ca169e097b176a7fcde0dd8faa51be6f4a9cb75befef54b72bfa7ccf5c05823
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00019-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00019-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..526ee947c8fbf3d101b9534f3da712b1b178f016
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00019-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d80d2806cfe9b54ee2b46d11435a5364f47ede442316806eaba704fde46cd110
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00020-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00020-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..93577a07345582d41fc62c9d877f707e5197308c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00020-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:455524b4d620059e53eea3960327c05a8b356400426f9457d224308ce4af0341
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00021-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00021-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e0fbf05bd9a84eed4a58ec80beead25b55d5f7b6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00021-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:994be4a2fd80da6b01004cdee63a81f57662cf18129649eeef52d67ebd4893bb
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00022-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00022-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..78eadacc1b0fce67bd7bf55fe6fe48c2ad5a2663
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00022-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:489957071be8f166e224a0a5d5237a85064b29e4908ea9c31f813294b29ba29c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00023-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00023-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7a535064f2f4e3e237fc84ce2a3c08c41c26d143
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00023-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8e2ab108ea6ff131f3116a47b16d0e2b709ffe4f22ad8a947c1976abc1372e2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00024-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00024-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..efd17eb879da09038c24355d41633832057138ad
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00024-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e93ef759c3ed184524ea1ba91071a476919871dc5ac3f23a9c691a7b9170f4f8
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00025-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00025-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0c24f1680fd4f3bae01ad6e56e83bf694ae51f06
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00025-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c72a8b11fd78728f69df0256ae479229864795b397739e40e178d37d880d1cd7
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00026-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00026-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..516b0320159ee9be43dfef12318253d5e8d0706c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00026-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a5c8c02c61b959955da8044ffc1f39041558cd495fa9e90f1a8765e2dd270ce
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00027-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00027-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..02d10e49341a0480b15ed24d31943a3dcdd21df4
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00027-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42634a603a9c2d3481d99c83251dba6f4b202b2f28f22d68f9d0f80352fbdbe6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00028-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00028-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d9fe093b71f2a895f71ed6c2640aab6467683bfe
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00028-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec66473844d5e3fa133b1c68072d2c2c101c8032dece3b3627674ec0f931a459
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00029-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00029-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..115df2eddee804380f7ed405715024107130dce9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00029-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a94e1822581abe78d661c2ebce0983d9db6431269707ce097cce09ae2546a28
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00030-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00030-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..80bceb06be7b61fddaa2bef22c4911e3287e7fc2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00030-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:160a59528f4d38b391a461a2d410b57eb9eee7c65cb055860052b217d16d77f4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00031-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00031-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1eb39be3998e925ca2da27582cbc66ce2684e1d2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00031-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d8029a662118b96baac250dddeb97bcfd7792ab40efdeed1c5390bbc865b305
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00032-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00032-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29f2dfc346f50dc09da1fba0444fedc293ec1a1e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00032-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efe00f3fbcc92c69caa26343eaac701b21a98cc85692c39d70306f8ed9a2d661
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00033-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00033-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..20bbec45909c921b1d74854c46fc9add65f2fdf7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00033-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0c66fc28dcee73a187ee1c7f8c6a3142e7016e1d7abcf89912a12c2b51f21cf
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00034-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00034-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dff8e82108bfb753c28e93141b40c4ae78f12d17
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00034-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a474698e86b2cdaaea3f267b09688f8aac3c28e5a30f862504ebd6834b185a2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00035-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00035-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..08c0f475e4f5cb20bc84a9d2ef2c4329920c1406
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00035-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d739c1b98009082100c75cbaa783dc58db5ea72f587728f96e58a07e1b46dd51
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00036-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00036-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..eeae3a79d18eeb783b27dcd55465b8ce8fdf8424
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00036-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90f7c9b60a0066e944084804982bad67071d9456bc2e803b0d9abc84967e10ec
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00037-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00037-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..60f289cfed2062f8db007e135961c0e06c913fde
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00037-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb4c60d93df71d9235bfc2fdf30cb80fe79be519bfd533efb37f3ca981e71b5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00038-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00038-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..077a18a9b1ee3ec8f69bd9032d8188adec949223
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00038-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6eb632db5be27b6bf34c9c459f37a855dd1e553384947fd56c21dbbfbd265cbe
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00039-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00039-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e8406a802681d7b94b7600ceb94340957a0cafea
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00039-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7873b7442e491d5f49d46abd07e12501e56112003571a11f8ae0ade6f13ec4f2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00040-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00040-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c6f30ce7ce6a7b2539ccbf2f29e66cae849d802d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00040-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c24e817c5808e6811dc556d323a61699adfa10844194e06b4158a7758bae1239
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00041-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00041-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f3e7a390dd546eed53b64a15f4a931b11fc1e4c6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00041-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c68b23748a266ce0d120f0aee728416737ebb43dd1d07c5559741d05690d612
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00042-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00042-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a9c5c4308ab5c8d9db4a913ea3d11f6f7c5bd7c9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00042-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8eecdb9d38874da8834db316c60912bed482d81ca9c64fa9beea882be2359d8c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00043-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00043-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2da93d7aeeb44ee5263c930081db3644796caf9f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00043-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d8a098df3383c5fb94a665ce30e8328238fa1adba6e33522e5c59fc0951248f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00044-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00044-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1843b55cf1eb9fd28aaee9e9b0f16e7cd6723cbd
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00044-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e5cc1add1eee78d99817c787fd4d076e5edaa54750a40dd374dfd6c386e68f7
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00045-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00045-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..30154ebeeb20bccb5f5345e3aa1dbdb02fdf9860
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00045-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a295a2463f369e95c4c1f65408fa9da48cbffdb60b8e1fcb00697c2ec3bde4ee
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00046-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00046-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b1b679d1c138a8d373503906be5ad6b0a443b61
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00046-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75b63f11d9e5b439519bb397851afc1a4cce087361a0ce3502d322acddfcfe60
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00047-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00047-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c6610c59f55027931ca2959e11d0fbe01a599ace
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00047-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4203ce92e8c3f18fc27c50f930251248271bac2d2ee91356d9444befa9183085
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00048-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00048-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5e40cefd197aac2fffa422bc8a2a59ddaad14761
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00048-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06ccd7d953a52f7554e42379f69287e3ee4cf5203ceded0513748ae1f43cd020
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00049-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00049-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f37b0f1da5eea21436bf251785d4f780b8470f00
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00049-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d3b7e59a999e377060df991dde24a440bf34bd223f551770ae9061fffa5e08d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00050-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00050-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..03b7799c2d91a5a917461b7c06aa18cd0bc1e7f7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00050-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d90ea1117b9e73ae638366f181e2c48cb1a9d69a00293a59464a8b761a8ef4b2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00051-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00051-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c766c242bdd694c5ca2a9c6d3483d41a63bde2d0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00051-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:353d136a9f889d85135a487cb8431570144509133987750b2334fb6fea95189c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00052-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00052-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a45583a6d0bb36745b8c8d941a7f6f41ed2f30ec
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00052-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fde2b3e27ee813ade8fed20e5e40b563d62476860e6a2a471c4ffe43b9c102e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00053-of-00053.bin b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00053-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2720fabc67def00de74062d901013f2a957fbbcc
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model-00053-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:767e29491eb1143811c3dd2daed7d990be943e91cacc7a4120165f5f13b669f2
+size 1207656908
diff --git a/model_hubs/Skywork-13B-Base-1.5T/pytorch_model.bin.index.json b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..163c37a78b34efe7cc858ea3fdca93e4c7c25699
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/pytorch_model.bin.index.json
@@ -0,0 +1 @@
+{"metadata": {"total_size": 27708239872}, "weight_map": {"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.2.input_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.up_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.down_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.3.input_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.up_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.down_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.4.input_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.v_proj.weight": 
"pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.up_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.down_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.5.input_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.up_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.down_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.6.input_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.up_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.down_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.7.input_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.up_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.down_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.8.input_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.up_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.down_proj.weight": "pytorch_model-00009-of-00053.bin", 
"model.layers.9.input_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.up_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.down_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.10.input_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.up_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.down_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.11.input_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.up_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.down_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.12.input_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.up_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.down_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.13.input_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.v_proj.weight": 
"pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.up_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.down_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.14.input_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.up_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.down_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.15.input_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.up_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.down_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.16.input_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.up_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.down_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.17.input_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.up_proj.weight": "pytorch_model-00018-of-00053.bin", 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.18.input_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.up_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.down_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.19.input_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.up_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.down_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.20.input_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.up_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.down_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.21.input_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.up_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.down_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.22.input_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.k_proj.weight": 
"pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.up_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.down_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.23.input_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.up_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.down_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.24.input_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.up_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.down_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.25.input_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.up_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.down_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.26.input_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00027-of-00053.bin", 
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.down_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.27.input_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.up_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.down_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.28.input_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.up_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.down_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.29.input_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.up_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.down_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.30.input_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.up_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.down_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.31.input_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.q_proj.weight": 
"pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.up_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.down_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.32.input_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.up_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.down_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.33.input_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.up_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.down_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.34.input_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.up_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.down_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.35.input_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00036-of-00053.bin", 
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.up_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.down_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.36.input_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.up_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.down_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.37.input_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.up_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.down_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.38.input_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.up_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.down_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.39.input_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.up_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.down_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.40.input_layernorm.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.post_attention_layernorm.weight": 
"pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.rotary_emb.inv_freq": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.up_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.down_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.41.input_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.rotary_emb.inv_freq": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.up_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.down_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.42.input_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.rotary_emb.inv_freq": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.up_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.down_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.43.input_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.rotary_emb.inv_freq": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.up_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.down_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.44.input_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00045-of-00053.bin", 
"model.layers.44.self_attn.rotary_emb.inv_freq": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.up_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.down_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.45.input_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.rotary_emb.inv_freq": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.up_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.down_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.46.input_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.rotary_emb.inv_freq": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.up_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.down_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.47.input_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.rotary_emb.inv_freq": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.up_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.down_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.48.input_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.rotary_emb.inv_freq": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.up_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.down_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.49.input_layernorm.weight": 
"pytorch_model-00050-of-00053.bin", "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.rotary_emb.inv_freq": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.up_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.down_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.50.input_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.rotary_emb.inv_freq": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.up_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.down_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.51.input_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.rotary_emb.inv_freq": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.up_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.down_proj.weight": "pytorch_model-00052-of-00053.bin", "model.norm.weight": "pytorch_model-00053-of-00053.bin", "model.embed_tokens.weight": "pytorch_model-00053-of-00053.bin", "lm_head.weight": "pytorch_model-00053-of-00053.bin"}}
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-1.5T/special_tokens_map.json b/model_hubs/Skywork-13B-Base-1.5T/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..d85ba6cb6820b01226ef8bd40b46bb489041c6a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-1.5T/tokenization_skywork.py b/model_hubs/Skywork-13B-Base-1.5T/tokenization_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac378d77d2d90d17340b3cb8eaf91bdb1656b71d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/tokenization_skywork.py
@@ -0,0 +1,250 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+"""Tokenization classes for Skywork."""
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+if TYPE_CHECKING:
+ from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+B_INST, E_INST = "[INST]", "[/INST]"
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
+ that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+class SkyworkTokenizer(PreTrainedTokenizer):
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+ pad_token=None,
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ clean_up_tokenization_spaces=False,
+ legacy=True,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ self.legacy = legacy
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ legacy=legacy,
+ **kwargs,
+ )
+ if legacy:
+ logger.warning_once(
+ f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+ )
+
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+ def tokenize(self, text, **kwargs) -> List[str]:
+ # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+ # the beginning of the text
+ if not self.legacy:
+ text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+ return super().tokenize(text, **kwargs)
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+ def _tokenize(self, text):
+ if not self.legacy:
+ is_first = text.startswith(SPIECE_UNDERLINE)
+ if is_first:
+ text = text[1:]
+
+ tokens = self.sp_model.encode(text, out_type=str)
+
+ if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+ tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+ return tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
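+        # Example (added comment, not in the original file): with the defaults
+        # add_bos_token=True and add_eos_token=False, token_ids_0=[5, 6] comes back
+        # as [self.bos_token_id, 5, 6]; a second sequence is appended the same way,
+        # i.e. with another bos prepended and no eos.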
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (
+ bos_token_id
+ + ([0] * len(token_ids_0))
+ + eos_token_id
+ + bos_token_id
+ + ([0] * len(token_ids_1))
+ + eos_token_id
+ )
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+ if token_ids_1 is not None:
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+ return output
+
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+ dialogue = list(conversation.iter_texts())
+ if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+ [not is_user for is_user, msg in dialogue[1::2]]
+ ):
+ raise ValueError(
+ "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+ )
+
+ dialog_tokens: List[int] = []
+ if len(conversation.past_user_inputs) > 0:
+ if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+ conversation.past_user_inputs[0] = (
+ B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+ )
+ elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+ dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+ dialog_tokens += sum(
+ [
+ [self.bos_token_id]
+ + self.encode(
+ f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+ )
+ + [self.eos_token_id]
+ for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+ ],
+ [],
+ )
+ if not (dialogue[-1][0]):
+            raise ValueError(f"Last message must be from user, got an assistant message: {dialogue[-1][1]!r}")
+ dialog_tokens += [self.bos_token_id] + self.encode(
+ f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+ )
+ return dialog_tokens
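+
+# Usage sketch (added for illustration, not part of the original file). Assuming the
+# checkpoint layout in this repository and the auto_map entry in tokenizer_config.json,
+# the tokenizer can be loaded through AutoTokenizer with trust_remote_code:
+#
+#   from transformers import AutoTokenizer
+#
+#   tok = AutoTokenizer.from_pretrained(
+#       "model_hubs/Skywork-13B-Base-1.5T", trust_remote_code=True
+#   )
+#   ids = tok("Skywork is a 13B base model.").input_ids  # bos id prepended by default
+#   text = tok.decode(ids)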
diff --git a/model_hubs/Skywork-13B-Base-1.5T/tokenizer.model b/model_hubs/Skywork-13B-Base-1.5T/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..decbfe220922d6a38ff52541ef3927b97fb7893e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+size 994250
diff --git a/model_hubs/Skywork-13B-Base-1.5T/tokenizer_config.json b/model_hubs/Skywork-13B-Base-1.5T/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c232b8b78a3ad2ce894b9a17628f3821627ccd7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1.5T/tokenizer_config.json
@@ -0,0 +1,40 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SkyworkTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_skywork.SkyworkTokenizer",
+ null
+ ]
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-1T/config.json b/model_hubs/Skywork-13B-Base-1T/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..176a4ca6fc2d7e436819a6c762c7967edb3a7b3f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "SkyworkForCausalLM"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "hidden_act": "silu",
+ "hidden_size": 4608,
+ "initializer_range": 0.01,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 131072,
+ "model_type": "skywork",
+ "num_attention_heads": 36,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 36,
+ "rms_norm_eps": 1e-06,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.1",
+ "use_cache": true,
+ "vocab_size": 65519
+ }
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-1T/configuration_skywork.py b/model_hubs/Skywork-13B-Base-1T/configuration_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbad8ae1e08d431a14c5de719267629feb4cd5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/configuration_skywork.py
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+ model_type = "skywork"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+ raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'ntk'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
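+
+# Example (added for illustration, not part of the original file): a valid
+# `rope_scaling` dict has exactly two keys, "type" (one of "linear", "dynamic",
+# "ntk") and "factor" (a float strictly greater than 1.0):
+#
+#   cfg = SkyworkConfig(rope_scaling={"type": "dynamic", "factor": 2.0})
+#   # SkyworkConfig(rope_scaling={"type": "yarn", "factor": 2.0}) raises ValueError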
diff --git a/model_hubs/Skywork-13B-Base-1T/generation_config.json b/model_hubs/Skywork-13B-Base-1T/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aece903f676603332b5bc1b1a29d6e44a8c02464
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/generation_config.json
@@ -0,0 +1,10 @@
+{
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.33.1"
+}
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-1T/modeling_skywork.py b/model_hubs/Skywork-13B-Base-1T/modeling_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d2898e0e7d379dc6883c4e34043e537689b8bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/modeling_skywork.py
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
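+
+# Illustrative example (added comment, not in the original file): for tgt_len=3 and
+# past_key_values_length=0 the rows of the mask are [0, m, m], [0, 0, m], [0, 0, 0]
+# with m = torch.finfo(dtype).min, i.e. a causal (lower-triangular) mask.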
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class SkyworkRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ SkyworkRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
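+        # Illustrative note (added comment, not in the original file): this computes
+        # y = weight * x / sqrt(mean(x**2, dim=-1) + eps), with the reduction done in
+        # float32 before casting back to the input dtype.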
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base * scaling_factor
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
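+
+# Shape walk-through (added comment, not in the original file): q is
+# [bsz, num_heads, q_len, head_dim] and k is [bsz, num_key_value_heads, q_len, head_dim];
+# cos/sin arrive as [1, 1, seq_len, head_dim], are squeezed to [seq_len, head_dim],
+# gathered with position_ids to [bsz, 1, q_len, head_dim], and broadcast against q and k,
+# so q_embed and k_embed keep the shapes of q and k.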
+
+
+class SkyworkMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ if self.config.pretraining_tp > 1:
+ slice = self.intermediate_size // self.config.pretraining_tp
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+ gate_proj = torch.cat(
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+ )
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+ down_proj = [
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+ ]
+ down_proj = sum(down_proj)
+ else:
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
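+
+# Worked example (added comment, not in the original file): with the config shipped in
+# this repo (num_attention_heads=36, num_key_value_heads=36) n_rep is 1 and this is a
+# no-op; a hypothetical GQA setup with 36 query heads and 4 KV heads would give n_rep=9,
+# expanding each KV head 9 times along dim=1.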
+
+
+class SkyworkAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = SkyworkRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "ntk":
+ self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+ print('-'*80)
+            print(f"USING CUSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
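+ # Tensor shapes through forward (these follow directly from the size checks above):
+ #   query_states: (bsz, num_heads, q_len, head_dim); key/value states after repeat_kv:
+ #   (bsz, num_heads, kv_seq_len, head_dim); attn_weights: (bsz, num_heads, q_len, kv_seq_len);
+ #   attn_output after the final reshape: (bsz, q_len, hidden_size).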
+
+
+class SkyworkDecoderLayer(nn.Module):
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = SkyworkAttention(config=config)
+ self.mlp = SkyworkMLP(config)
+ self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
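+ # Net effect of this layer (pre-norm residual form, mask/positions omitted):
+ #   h = h + SelfAttn(RMSNorm(h));  h = h + MLP(RMSNorm(h))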
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+ config_class = SkyworkConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SkyworkDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, SkyworkModel):
+ module.gradient_checkpointing = value
+
+
+class SkyworkModel(SkyworkPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+ Args:
+ config: SkyworkConfig
+ """
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
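+ # Illustration (schematic values): for a 3-token prompt with no padding and no cache, the
+ # combined mask is additive and causal, roughly
+ #   [[0, -inf, -inf],
+ #    [0,    0, -inf],
+ #    [0,    0,    0]]
+ # where "-inf" stands for the large negative fill value, so position i attends only to j <= i.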
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # pass past_key_value (usually None during training) and output_attentions through explicitly
+ return module(*inputs, past_key_value, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = SkyworkModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ if self.config.pretraining_tp > 1:
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+ logits = torch.cat(logits, dim=-1)
+ else:
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
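+ # Shift illustration: for labels [t0, t1, t2, t3], the logits at position i are scored against
+ # t_{i+1}, so logits[..., :-1, :] pairs with labels[..., 1:] and t0 is never used as a target.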
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
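+ # Usage sketch (illustrative; any real checkpoint/tokenizer setup is omitted):
+ #   out = model(input_ids=batch_ids, labels=batch_ids)
+ #   out.loss    # scalar next-token cross-entropy
+ #   out.logits  # (batch, seq_len, vocab_size), upcast to float32 above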
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
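+ # Example (hypothetical left-padded row): attention_mask [0, 1, 1, 1] gives cumsum-1 = [-1, 0, 1, 2],
+ # and masked_fill sets the padded slot to 1 -> [1, 0, 1, 2], so real tokens get positions 0..2
+ # regardless of padding.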
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = SkyworkModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
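+ # Example (hypothetical right-padded row): with pad token p, input_ids [a, b, c, p, p] gives
+ # argmax(eq(input_ids, p)) = 3, so sequence_lengths = 2 and the logits of the last real token
+ # (index 2) are pooled for classification.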
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00001-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00001-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4547d65956506a55f4e333855f567389860e8b07
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00001-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c92030a7974426d1c7c19d5f998b616d66c9d87d15ffe109dafd2a9ccb90bef9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00002-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00002-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3e690073608546675b298be0a938fd1bcaec10b0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00002-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d27d372bfb0d4b06f3d51b265fa20f8187d684995f62eed4d973faffb21ca373
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00003-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00003-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f395d3b212ef8c68d3d4ac15b3886e571fdabb9b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00003-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff695cb24f4a9f29e0bde7b2ba87f4996883680151d593130dc6594ab11c11ee
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00004-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00004-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5de0d776b5856a0c9388b942ca4de3e331a80451
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00004-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcd1f1fd8ec08511a91436a9750f1b1ff42327fcc7b0f2df9515239b4793718b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00005-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00005-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e59a76bc6d5cd05c2110c286a538b4a23e301a03
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00005-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b0a606ee737473b7f2db1aa73e92e6c2a81241d27ee84ac6ca8196e173910df
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00006-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00006-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45cff42fe18b232398dc70540e4e66768d090c1a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00006-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ec0442620d979be432e067276f0a1cc12cef65415b82b0c6fffd257db3b3802
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00007-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00007-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5766eb58126d855d123c380d24adf9a31682e0ac
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00007-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:292a4e16a3b3cd7dd687d083f3f6867fff8f728c4ac1fbc8d35659f33bf47e06
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00008-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00008-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ac1311510a7e5af9eac6e8ec67d99209bf76bcb4
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00008-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdfb348caa29371204c83a5961327bd77d3d23a2983d75c4590819a52d6da10e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00009-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00009-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9331da848724b83d5809268717d31e6ad613ee95
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00009-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:936b10a504e6cb04946def3f40d161c7e5bca5deef68d3173e970ac49f88b11a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00010-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00010-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..12bbaa5252ee6c33cb016742168459c9be4dd170
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00010-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e80e09a46aba0770b6f54a44d7d90034f043d73468e7837f9c2db0ec395705c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00011-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00011-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4aca6125c1b0864c2236cb1fa034ff197ee67c03
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00011-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:449d76b1c3fc279dfac5181b8c28539547481d062c7837c4c6589e1508086f13
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00012-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00012-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..870fcbc1c46f7e1d145a9870d9db6c83d745935d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00012-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bd47cbd5a5ef80854f6a15a15fcff889fdb719f8a4861d1b1bac97840814be2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00013-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00013-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b0371577cd936db758123f1c073ba605237b55b3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00013-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3231d31509ff1fe5778a34d0754bc59f3e0f0001e4fdbcebf2c659d965ab81f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00014-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00014-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c56157a5395961e3950085dfcaa04a5dff5729b1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00014-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a5f2ad7797d25951fb499f7d908c0a51f6770a7b836c67a27b74e3742672ca9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00015-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00015-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0de44d3619ff1eb43688c33c47713f7b53694159
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00015-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cb76b1985f996b19d425e393077566255a968f685e8693bb7f1fd7faa51b33d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00016-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00016-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f10a9ad8f84291083bced6ed1481f980faf8ad33
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00016-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0747c78b23ca6b6526cb5796b75d5a2e974af2472a235c98b1c6d14c4cf775d2
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00017-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00017-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b34684ea2a3261abe45fc1e6fa741f5672ef5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00017-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57554dc876dddad87c5e53c5c46211bb2deb36ef2e5030ffde0abaa2dc9bb9ff
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00018-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00018-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..64b4856d0f633db610d13095df2795e964cbc2b4
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00018-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c03a8e4b377566c635eb5602ab35593263e476d975861ce420ce7ef1032a44d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00019-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00019-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..34d615faa70ebcdb21c87be350bff5bb9dd13515
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00019-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5415c63e08f846d3c3be18eb4a82bec8b3568bd7d5383f3b938546a2d8fbda34
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00020-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00020-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8aef93bedf287ae96ed3bbb466eafb81c3c94a97
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00020-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:295a6a5c6cde907867758af594fcdfea658b635f7be07adc224b2a25e2f92c68
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00021-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00021-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..36674c6ac0b7b3cf7a26aa67df6fdd6866293bbb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00021-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1fe88467a7abda16884e21c67c86becdb7bea7fe6d2305e6745ed3ef49e5bc4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00022-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00022-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..486443baa7db7ac17aa496e0f4421eabca79e1b1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00022-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1314b19fab09f2bd814d1462e4194d2d8605791c1c99ccdac5ce887287bcca39
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00023-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00023-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f158f14d93149615577abfd31c157614ecbd64bd
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00023-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a20ea7d843fcc3cd22607373f5b4f7f3b54628d24863879e8ee27b8bbc2cbdd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00024-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00024-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..694070652ca1bc0f1b4f4f9613213df880402661
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00024-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e797062c80281d5e99b979198b7c201e930a65bae65f50bd9a52e79c28888d51
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00025-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00025-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1e172650d45d4dc9e5363a7920908efc4cefd658
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00025-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c2beccd8b6b545f8fb2a6af740adc787da0dc3a58122570bb2b1b81c099f861
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00026-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00026-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d72512cfe33ccc1ceea0f4dfafe0eadde268465c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00026-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a62dc80b2736b6cc0bf893f03eb7c59ace013611ee7e5c07430eab3182a039a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00027-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00027-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1e0d24f41741d58cb7c0d5f1d96d55a1102d53af
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00027-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11b1ceaf61d844bd6d1d1d50f48d5ac0b862b8134198912543a136870d801ad9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00028-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00028-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2dad5586b022df8fac1dfdb354a2c538fa702904
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00028-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7a24f29e2d0c4a8eee237086a7ca170c802d87211a6360e15806d3ed1c5918e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00029-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00029-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..123be4ce877e92fab89af64b1536a756b6dbdadf
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00029-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc1ca01484a09b9bef817342ce4c16ee0c34fef80c64e8e27a7b8ecfc678be2a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00030-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00030-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b7a33b35334d7ec2f32b533e38b01f14fe8f323f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00030-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:049e6c90342774294732ec96c6ec08fd236fd6d0341e6b55263fbaeecb042d82
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00031-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00031-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..84a1936465d9a02764d65aeacd8986e3ed8d576a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00031-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:990f517ffa67094e78046e45286f0dd38635815756b94efbb2e0b2e5bd731b6c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00032-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00032-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..de1c6dd384b37a4296604312ff62efa938a57a90
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00032-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db04b0455c222d1ab365cf17b908c6aa1152d5403551d79200d22b93f6bff3ed
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00033-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00033-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b918d9a616e5ac0c5513c3da8230b5e23cf3efef
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00033-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0dfae986ccfc2ea7d31d52bc6e1488696237770bd97d4f33330b9fb47a41059
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00034-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00034-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c01eadb995bfdf69601604cf181153f62e8abd70
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00034-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:941cf744d0ccc66b7931168088efaec5e30b97fd9db849fe1fc08ac5e63d2945
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00035-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00035-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1a9805c09cea6bf3ae6060ce2f4aaaa814b7ad05
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00035-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f388cf8601794bcb515d7b40c9072155777b74348f25bdfe845ea21fa88de784
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00036-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00036-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fbbec238964402607bd22f4f32e1bece46f28d02
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00036-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9117395759fdf3126876bb155d33c31ab4c2858c8597d38df7610818a65849da
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00037-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00037-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..126307a37811054a71543b81ef611d6aa598342b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00037-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f72701b596e77b52af882b610fd5cdb183dc24a68cea0dfaa3367edb74e56698
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00038-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00038-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..595a80b5f344e4ce537a5de6c0e59cca712464ae
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00038-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c03f66bdbbbe7306f3243652febf37ae4ed1144f300f7a7e874a94ccfc3bb7b4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00039-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00039-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..77718267c2bdce0152ef571e187d5f36fe528911
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00039-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6c43eabc4ee96712cba57cc0572c9ff5c8fac4d85c0b6de4d258bde4ec4506
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00040-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00040-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..35453e6e8a13f68cdb650c2c9a605434a76f075d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00040-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f89bbf6845742af65f6cffb9965e29a2997a5a120938e7ff4cdd4b832e79d69
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00041-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00041-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8702cf863fed58d7ed1d8ddfb27f26691afb2eb0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00041-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:927e235f69e10b6dbb5070ba3bd340f05adfc056737de89605a4016e78f2c805
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00042-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00042-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..886fe7bf1aae7cb5fc19e6d4c23206d623ef9e89
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00042-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c08b96aa3cef9376525fd308257dcd5ab78288f940fe993ea521741f187dd134
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00043-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00043-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..943377a8422d0b4af76706cd00f4b04adfd81924
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00043-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:946bd96de1abc496e525a5d437340ded17988be41b9e2845203150f194ac5f19
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00044-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00044-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..483e1d7a2915c485d513d99705ac67f783ff89e9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00044-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6e98baf24d5ec868b515a259207a2a3d930e9aed2db41e8500a84db5d47bb2f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00045-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00045-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..633ac37850a9afc2c3ede814732353344937ace8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00045-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4380e10f209b74e2d726fee6505d12ef3d10d6191e47d9c5ad070dd2a4cfc562
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00046-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00046-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c7cccd3ccc35ec0293c1b9861f593e250ac1180
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00046-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d58ae63033ebefcdf20ad32fddbf8b7819c8a7c576f9477149f79ecb57ec2903
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00047-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00047-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..84114fa86bcf14481402b8fad5381682524984ee
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00047-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd7f001e0b90b33ec6503f10476e23c8a3905d4c15b704bfc7f9f27eec753277
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00048-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00048-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6e329aa327cfcf50af8cc067c3cf8b2348f98d5c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00048-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3268e00d3f69baecafaee23f68de230d1994c1b7d94b1f6aff3993032c06ad1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00049-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00049-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3a5ee6e169b01425f2c5491b1b41f63935d06e25
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00049-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fca9d97f387de4921becb3b6c67bf151fe3215d786df2db79239016ee2228172
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00050-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00050-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f48964f8cea4af3a023c42c71d681d7353b72aca
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00050-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d39beadc5d6b6f8ae9ca6f3f8884ef460f218f5b536fc8571ffb9dbdbb6106f1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00051-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00051-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b9df26141a1b91d9129f4f07f7418e5ce6c65b1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00051-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f6612f1ca596bd5e1b8a744e86ef5a9282ca243447766373616eefaa409e74
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00052-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00052-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b4321b1417a8a1ab354ba81a2104c79b94b910fb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00052-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e87df3574c6f6018b05f42d0f6150de24d23cfbeaa738620a6416e069fc9b970
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model-00053-of-00053.bin b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00053-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..405e82a4af2ec87a8526aa75dc433c46f996fb66
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model-00053-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6fdda48c87276dbb0b44e1ace934d04e7efd6c23d97358bfd639bc222b333cc
+size 1207656908
diff --git a/model_hubs/Skywork-13B-Base-1T/pytorch_model.bin.index.json b/model_hubs/Skywork-13B-Base-1T/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..163c37a78b34efe7cc858ea3fdca93e4c7c25699
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/pytorch_model.bin.index.json
@@ -0,0 +1 @@
+{"metadata": {"total_size": 27708239872}, "weight_map": {"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.2.input_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.up_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.down_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.3.input_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.up_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.down_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.4.input_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.v_proj.weight": 
"pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.up_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.down_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.5.input_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.up_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.down_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.6.input_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.up_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.down_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.7.input_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.up_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.down_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.8.input_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.up_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.down_proj.weight": "pytorch_model-00009-of-00053.bin", 
"model.layers.9.input_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.up_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.down_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.10.input_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.up_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.down_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.11.input_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.up_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.down_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.12.input_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.up_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.down_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.13.input_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.v_proj.weight": 
"pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.up_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.down_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.14.input_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.up_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.down_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.15.input_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.up_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.down_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.16.input_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.up_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.down_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.17.input_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.up_proj.weight": "pytorch_model-00018-of-00053.bin", 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.18.input_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.up_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.down_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.19.input_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.up_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.down_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.20.input_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.up_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.down_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.21.input_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.up_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.down_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.22.input_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.k_proj.weight": 
"pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.up_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.down_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.23.input_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.up_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.down_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.24.input_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.up_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.down_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.25.input_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.up_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.down_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.26.input_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00027-of-00053.bin", 
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.down_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.27.input_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.up_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.down_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.28.input_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.up_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.down_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.29.input_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.up_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.down_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.30.input_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.up_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.down_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.31.input_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.q_proj.weight": 
"pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.up_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.down_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.32.input_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.up_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.down_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.33.input_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.up_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.down_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.34.input_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.up_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.down_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.35.input_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00036-of-00053.bin", 
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.up_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.down_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.36.input_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.up_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.down_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.37.input_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.up_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.down_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.38.input_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.up_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.down_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.39.input_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.up_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.down_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.40.input_layernorm.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.post_attention_layernorm.weight": 
"pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.rotary_emb.inv_freq": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.up_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.down_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.41.input_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.rotary_emb.inv_freq": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.up_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.down_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.42.input_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.rotary_emb.inv_freq": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.up_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.down_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.43.input_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.rotary_emb.inv_freq": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.up_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.down_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.44.input_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00045-of-00053.bin", 
"model.layers.44.self_attn.rotary_emb.inv_freq": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.up_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.down_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.45.input_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.rotary_emb.inv_freq": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.up_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.down_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.46.input_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.rotary_emb.inv_freq": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.up_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.down_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.47.input_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.rotary_emb.inv_freq": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.up_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.down_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.48.input_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.rotary_emb.inv_freq": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.up_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.down_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.49.input_layernorm.weight": 
"pytorch_model-00050-of-00053.bin", "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.rotary_emb.inv_freq": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.up_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.down_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.50.input_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.rotary_emb.inv_freq": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.up_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.down_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.51.input_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.rotary_emb.inv_freq": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.up_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.down_proj.weight": "pytorch_model-00052-of-00053.bin", "model.norm.weight": "pytorch_model-00053-of-00053.bin", "model.embed_tokens.weight": "pytorch_model-00053-of-00053.bin", "lm_head.weight": "pytorch_model-00053-of-00053.bin"}}
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-1T/special_tokens_map.json b/model_hubs/Skywork-13B-Base-1T/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..d85ba6cb6820b01226ef8bd40b46bb489041c6a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-1T/tokenization_skywork.py b/model_hubs/Skywork-13B-Base-1T/tokenization_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac378d77d2d90d17340b3cb8eaf91bdb1656b71d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/tokenization_skywork.py
@@ -0,0 +1,250 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+"""Tokenization classes for Skywork."""
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+if TYPE_CHECKING:
+ from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
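+# Llama-2-style chat template delimiters, consumed by _build_conversation_input_ids below.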
+B_INST, E_INST = "[INST]", "[/INST]"
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
+that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+class SkyworkTokenizer(PreTrainedTokenizer):
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="<unk>",
+ bos_token="<s>",
+ eos_token="</s>",
+ pad_token=None,
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ clean_up_tokenization_spaces=False,
+ legacy=True,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ self.legacy = legacy
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ legacy=legacy,
+ **kwargs,
+ )
+ if legacy:
+ logger.warning_once(
+ f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+ )
+
+
+ def __getstate__(self):
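+ # The SentencePiece processor is not picklable, so drop it from the state and
+ # keep its serialized model proto; __setstate__ rebuilds the processor from it.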
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+ def tokenize(self, text, **kwargs) -> List[str]:
+ # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+ # the beginning of the text
+ if not self.legacy:
+ text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+ return super().tokenize(text, **kwargs)
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+ def _tokenize(self, text):
+ if not self.legacy:
+ is_first = text.startswith(SPIECE_UNDERLINE)
+ if is_first:
+ text = text[1:]
+
+ tokens = self.sp_model.encode(text, out_type=str)
+
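+ # In non-legacy mode, strip the artificial leading SPIECE_UNDERLINE that tokenize()
+ # prepended when the original text did not start with whitespace.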
+ if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+ tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+ return tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (
+ bos_token_id
+ + ([0] * len(token_ids_0))
+ + eos_token_id
+ + bos_token_id
+ + ([0] * len(token_ids_1))
+ + eos_token_id
+ )
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+ if token_ids_1 is not None:
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+ return output
+
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
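+ # Llama-2 chat format: wrap the default system prompt in B_SYS/E_SYS inside the first
+ # user turn, encode each (user, assistant) pair as "[INST] user [/INST] answer" delimited
+ # by BOS/EOS, and finish with the pending user message.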
+ dialogue = list(conversation.iter_texts())
+ if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+ [not is_user for is_user, msg in dialogue[1::2]]
+ ):
+ raise ValueError(
+ "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+ )
+
+ dialog_tokens: List[int] = []
+ if len(conversation.past_user_inputs) > 0:
+ if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+ conversation.past_user_inputs[0] = (
+ B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+ )
+ elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+ dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+ dialog_tokens += sum(
+ [
+ [self.bos_token_id]
+ + self.encode(
+ f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+ )
+ + [self.eos_token_id]
+ for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+ ],
+ [],
+ )
+ if not (dialogue[-1][0]):
+ raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
+ dialog_tokens += [self.bos_token_id] + self.encode(
+ f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+ )
+ return dialog_tokens
diff --git a/model_hubs/Skywork-13B-Base-1T/tokenizer.model b/model_hubs/Skywork-13B-Base-1T/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..decbfe220922d6a38ff52541ef3927b97fb7893e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+size 994250
diff --git a/model_hubs/Skywork-13B-Base-1T/tokenizer_config.json b/model_hubs/Skywork-13B-Base-1T/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c232b8b78a3ad2ce894b9a17628f3821627ccd7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-1T/tokenizer_config.json
@@ -0,0 +1,40 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SkyworkTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_skywork.SkyworkTokenizer",
+ null
+ ]
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-2.5T/config.json b/model_hubs/Skywork-13B-Base-2.5T/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..176a4ca6fc2d7e436819a6c762c7967edb3a7b3f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "SkyworkForCausalLM"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "hidden_act": "silu",
+ "hidden_size": 4608,
+ "initializer_range": 0.01,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 131072,
+ "model_type": "skywork",
+ "num_attention_heads": 36,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 36,
+ "rms_norm_eps": 1e-06,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.1",
+ "use_cache": true,
+ "vocab_size": 65519
+ }
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-2.5T/configuration_skywork.py b/model_hubs/Skywork-13B-Base-2.5T/configuration_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbad8ae1e08d431a14c5de719267629feb4cd5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/configuration_skywork.py
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+ model_type = "skywork"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
diff --git a/model_hubs/Skywork-13B-Base-2.5T/generation_config.json b/model_hubs/Skywork-13B-Base-2.5T/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aece903f676603332b5bc1b1a29d6e44a8c02464
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/generation_config.json
@@ -0,0 +1,10 @@
+{
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.33.1"
+}
\ No newline at end of file
diff --git a/model_hubs/Skywork-13B-Base-2.5T/modeling_skywork.py b/model_hubs/Skywork-13B-Base-2.5T/modeling_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d2898e0e7d379dc6883c4e34043e537689b8bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/modeling_skywork.py
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for uni-directional (auto-regressive) self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class SkyworkRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ SkyworkRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
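+ # Precompute cos/sin tables for positions [0, seq_len); forward() re-invokes this
+ # only when a sequence longer than the cached maximum is requested.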
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
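+ # Dynamic NTK scaling: once the requested length exceeds the trained context, enlarge
+ # the RoPE base so the rotary frequencies stretch to cover the longer window.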
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
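+ # Fixed NTK scaling: the RoPE base is enlarged once by scaling_factor at construction,
+ # unlike the dynamic variant above, which rescales per requested sequence length.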
+ self.base = base * scaling_factor
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class SkyworkMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ if self.config.pretraining_tp > 1:
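+ # Reproduce tensor-parallel pretraining numerics: compute each projection in
+ # pretraining_tp column/row slices and recombine the partial results.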
+ slice = self.intermediate_size // self.config.pretraining_tp
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+ gate_proj = torch.cat(
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+ )
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+ down_proj = [
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+ ]
+ down_proj = sum(down_proj)
+ else:
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
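+# Illustrative sketch (not part of the original modeling code): with pretraining_tp == 1 the MLP
+# above reduces to down_proj(act(gate_proj(x)) * up_proj(x)); for LLaMA-style configs the
+# activation is typically SiLU. Functional version on random weights, relying on the
+# module-level torch / F imports.
+def _swiglu_sketch(hidden=16, inter=32):
+    x = torch.randn(2, 5, hidden)
+    w_gate, w_up, w_down = torch.randn(inter, hidden), torch.randn(inter, hidden), torch.randn(hidden, inter)
+    y = F.linear(F.silu(F.linear(x, w_gate)) * F.linear(x, w_up), w_down)
+    assert y.shape == x.shape
+    return y
+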
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Equivalent of torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep). The hidden states go from
+    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
+    """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
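+# Illustrative check (not part of the original modeling code): `repeat_kv` produces the same
+# layout as torch.repeat_interleave along the head dimension, as the docstring above states.
+def _repeat_kv_check(batch=2, kv_heads=2, n_rep=3, seq=4, head_dim=8):
+    kv = torch.randn(batch, kv_heads, seq, head_dim)
+    assert torch.equal(repeat_kv(kv, n_rep), kv.repeat_interleave(n_rep, dim=1))
+    return kv
+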
+
+class SkyworkAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = SkyworkRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "ntk":
+ self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+            print('-'*80)
+            print(f"USING CUSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
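+            # Note (illustrative, not from the original code): `config.rope_scaling` is either
+            # None (plain RoPE with base `rope_theta`) or a dict consumed here, e.g.
+            #     {"type": "linear",  "factor": 2.0}    # positions typically rescaled by the factor
+            #     {"type": "dynamic", "factor": 2.0}    # base typically adjusted once the sequence exceeds the trained window
+            #     {"type": "ntk",     "factor": 100.0}  # base multiplied by the factor up front (class above)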
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
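+# Illustrative sketch (not part of the original modeling code): the core of the attention forward
+# above for a single cached decoding step, with grouped-query attention. K/V keep
+# `num_key_value_heads` heads and are repeated to `num_heads` before the matmuls. Shapes are made
+# up; relies on the module-level torch / math imports and `repeat_kv` above.
+def _gqa_attention_sketch(bs=2, q_heads=8, kv_heads=2, head_dim=16, past_len=5):
+    q = torch.randn(bs, q_heads, 1, head_dim)                        # one new query position
+    k = torch.cat([torch.randn(bs, kv_heads, past_len, head_dim),
+                   torch.randn(bs, kv_heads, 1, head_dim)], dim=2)   # cached keys + new key
+    v = torch.cat([torch.randn(bs, kv_heads, past_len, head_dim),
+                   torch.randn(bs, kv_heads, 1, head_dim)], dim=2)
+    k = repeat_kv(k, q_heads // kv_heads)                            # [bs, q_heads, past_len + 1, head_dim]
+    v = repeat_kv(v, q_heads // kv_heads)
+    attn = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim)  # [bs, q_heads, 1, past_len + 1]
+    attn = torch.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
+    out = torch.matmul(attn, v)                                      # [bs, q_heads, 1, head_dim]
+    assert out.shape == (bs, q_heads, 1, head_dim)
+    return out
+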
+
+class SkyworkDecoderLayer(nn.Module):
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = SkyworkAttention(config=config)
+ self.mlp = SkyworkMLP(config)
+ self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
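+
+# Illustrative outline (not part of the original modeling code): the decoder layer above is
+# pre-norm — each sub-block sees RMS-normalized input while the residual stream itself stays
+# un-normalized between blocks. With stand-in callables:
+def _pre_norm_outline(hidden_states, attn_norm, attn, mlp_norm, mlp):
+    hidden_states = hidden_states + attn(attn_norm(hidden_states))
+    hidden_states = hidden_states + mlp(mlp_norm(hidden_states))
+    return hidden_states
+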
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+ config_class = SkyworkConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SkyworkDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, SkyworkModel):
+ module.gradient_checkpointing = value
+
+class SkyworkModel(SkyworkPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+ Args:
+ config: SkyworkConfig
+ """
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
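+
+    # Note (illustrative, not from the original code): the combined mask returned above is
+    # additive, broadcast to [bsz, 1, tgt_len, src_len], with the dtype minimum at positions a
+    # token must not attend to. A hand-rolled equivalent for the no-padding, no-cache case
+    # (hypothetical helper, not referenced elsewhere):
+    @staticmethod
+    def _toy_causal_mask(tgt_len, dtype=torch.float32):
+        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, dtype=dtype)
+        mask = torch.triu(mask, diagonal=1)   # zero on/below the diagonal, "minus infinity" above
+        return mask[None, None, :, :]         # [1, 1, tgt_len, tgt_len]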
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+                        # past_key_value (typically None under checkpointed training) and output_attentions are passed positionally
+ return module(*inputs, past_key_value, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = SkyworkModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ if self.config.pretraining_tp > 1:
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+ logits = torch.cat(logits, dim=-1)
+ else:
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
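+
+    # Note (illustrative, not from the original code): the label handling above is the standard
+    # causal shift — the logits at position t are scored against the token at position t + 1.
+    # Minimal stand-alone version of that shift (hypothetical helper, not referenced elsewhere):
+    @staticmethod
+    def _toy_shifted_loss(vocab_size=11, seq_len=5):
+        logits = torch.randn(2, seq_len, vocab_size)
+        labels = torch.randint(0, vocab_size, (2, seq_len))
+        shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
+        shift_labels = labels[..., 1:].contiguous().view(-1)
+        return CrossEntropyLoss()(shift_logits, shift_labels)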
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
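+        # Note (illustrative): the block above rebuilds position ids from the mask so that, with
+        # left padding, real tokens still count 0, 1, 2, ... regardless of the padding, e.g.
+        #   mask [[0, 0, 1, 1, 1]] -> cumsum - 1 [[-1, -1, 0, 1, 2]] -> masked_fill [[1, 1, 0, 1, 2]]
+        # and only the last position is kept once a cache is present.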
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
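+# Illustrative check (not part of the original modeling code): `_reorder_cache` gathers the beam
+# dimension of every cached key/value with index_select, so each beam keeps the cache of the
+# parent it was expanded from. Shapes below are made up.
+def _reorder_cache_sketch():
+    past = tuple(
+        (torch.randn(3, 2, 4, 8), torch.randn(3, 2, 4, 8))   # (key, value) per layer, beam dim first
+        for _ in range(2)
+    )
+    beam_idx = torch.tensor([2, 0, 1])
+    reordered = SkyworkForCausalLM._reorder_cache(past, beam_idx)
+    assert torch.equal(reordered[0][0], past[0][0][beam_idx])
+    return reordered
+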
+
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = SkyworkModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
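+
+
+# Illustrative check (not part of the original modeling code): the classifier above pools the
+# hidden state of each sequence's last non-padding token by finding the first pad_token_id and
+# stepping back one position. Pad id and shapes below are made up.
+def _last_token_pooling_sketch(pad_token_id=0):
+    input_ids = torch.tensor([[5, 6, 7, pad_token_id], [8, 9, pad_token_id, pad_token_id]])
+    logits = torch.randn(2, 4, 3)                                    # [batch, seq, num_labels]
+    sequence_lengths = torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1
+    pooled = logits[torch.arange(2), sequence_lengths]               # [batch, num_labels]
+    assert sequence_lengths.tolist() == [2, 1] and pooled.shape == (2, 3)
+    return pooled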
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00001-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00001-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d29dde85fd166eebb50485fcfece4872d44cbbb1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00001-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e10274adcd037904dc419146bbf41f12ddd177c0c132de499903355a479614a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00002-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00002-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c447941aa487bc5976320047c4bbf4fe71cbe21d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00002-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08c434ed45ee48a31f8c0798d31e3c46a8cc5900ebf4e92ceaa8be05d64a5c67
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00003-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00003-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..98260298b5f3555d9d258a7380935a075e8a4a21
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00003-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c514bfe8908bb817ad0930088806f72128856a1020a6456bdf725405464b9282
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00004-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00004-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc54a29d239a5cdccd1e6ea63d247f50ca094ce0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00004-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc9a829e59b8a3144dfc1a2ee9f3a9a8e2b6b59c607cad1882dffbc281c2a891
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00005-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00005-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e6c3e1da5ecfee6e08ea6da6792424e85fefc3a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00005-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:154523a29298f9ec4e51015f421b4f2ad044560ab175d4de2f4be1bd21b7b88c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00006-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00006-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e598bff590096195084a05d83e2fb6ea9b760239
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00006-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c75135659be64215b9782bd0b641f383ca036407037e1ee3d092ae05ed6302a9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00007-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00007-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bfd6f2d3fc1e05d8067aeb200d9a990f454d4b1a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00007-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:375bc0fc8fa4160493cd2f2797593d6a5e3d419dec0c1ce031103b78e8dca502
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00008-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00008-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f4335c2464d87ae5ee9452a5added88d7b89cfdc
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00008-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e03b9c77fdb8ac6602366b3c4d932ff918d9972dbdd166fd13500cb10b20480c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00009-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00009-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c0e5a38371e42b96e6b5146feb6fdd2f610e339e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00009-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:159b04daaf4313b32a761540bd3174b54e38359ebba85161db83d7f69ce99238
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00010-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00010-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..71123268199739e299bcfb92616095d76eab70e7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00010-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59459e927d8e801d1a5ff1d4421fe8e57c45bbdc0e0fb9c4ecaa60bd8ea48f68
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00011-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00011-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b83be6575e9f6912e4f94498f1a2836a9ce961da
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00011-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7426985ac8ef099c5ce6c91d87219042445f5b3ae7b7baf5b8d7467a09acc0fd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00012-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00012-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8c6e9b7f912ee86f6273b31eba47284d872b0d8c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00012-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8dcd6b51db362756131052bc95ca7cd47e262fbb0fd0a790375335a1c2937a6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00013-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00013-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5d0d21ad04df68aad056443735568725dc24195c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00013-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d4007aae335f97167667edeb08b6b9cf74f40ad1c869cb60524de7bfacb805b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00014-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00014-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1a066ec6989d86c35950f27e4560f84e0d848c24
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00014-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:888f9ea30c5f43b2c8a7bb8789bed9ab410fa15cfb89fb3b9656c1ad0b1a3d0a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00015-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00015-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b6a598364ee52812907582f02c76dbc67102e2a5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00015-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0c9e8a57b22de70396be67304377f8ca66a58737a6f0b1df8177aa7b1231756
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00016-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00016-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a52bbb10a4e3209368cc82817d5c3e60b6d9a2fe
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00016-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f917e62f494b159089257c516b40bee157e8f00552c6ce774776ec91bb5c90e5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00017-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00017-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bff5d8ee140b6d83db62666a552e165c1047b2c2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00017-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af4efc65412a830e9cb191d362274b79e112b22c2ec38e68b575be900aff5ccd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00018-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00018-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ba8ee0ceec638b75a2e3acec3619f00e75bbab5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00018-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f3f7eaefe50a99f5230ac3fcb0151ea1d0b7bf6842f97d85b17a67198396917
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00019-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00019-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8b704f33d69b81b63078e0aecaf7f248600f6cee
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00019-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5536b530176fcc1f971b7a98d3e477b1dabe2fc519d533e10888d004c0313f59
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00020-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00020-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5253c4e56d512122c5c049a8ca292b9c40a3a0c6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00020-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:749df819d7b2ec6d000c79d4e0d0a8243486507ca38bcf8bce5081861b8e2c36
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00021-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00021-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c51611eb7865d148c74e764e81978eed99372205
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00021-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04155c5d222e77639720333d104dddfb7b24ea95a7af88d28683bdf8d985b8f6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00022-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00022-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2e02886510439528aee1ca33bd4ec82c1753a798
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00022-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd81eaba8ec559425aef80f0df925fe5ee48e557705ada27e2b0e6b12ca715b3
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00023-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00023-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9676bbdf2aa2f00a96505262ee181a1d62eb5b6f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00023-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f3a6a4f23be4eeb0e0f4a851d621d455f1741e9b7d786638dec590eb70a2497
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00024-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00024-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2e09bc37be2e11b6bc59a80e0587f7cf2915fbf3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00024-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:860e1bc701689ba9f2e703a4b6066ebf658c911e5c336d7eb354bc331cfb1ffc
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00025-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00025-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9d048ee4738222407833f01e45f531ac6dcdee29
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00025-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81a725dea2589cef8dae9af5864478cbf42f545eb658732afd25d31f6ede02d9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00026-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00026-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9f6318da982fc9179dfb16cbcdbfaf28bed0c83c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00026-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0da642e09f8fdb10211232c6a2b73183b136731e216ad3154ee3b36fe05aebce
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00027-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00027-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7c26dc06e8fab3da16ba329933e5805b06c7f6c1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00027-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b64954d813b7ea339af68bc4f29002a5092ad053304552844eb04c80d98d992
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00028-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00028-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a5041bbb2d7134c53bdd295ae4a8466b3a37079b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00028-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9630398ba4e3111fe6700a76211d69aeaf9fbba67941bfd7da3a515b4898754
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00029-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00029-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c6fadd15153b5fd141311091fa0c8d4c936f1a82
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00029-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3694a26e9f2e1d39dffdde5ae5fc5825be298350755ab52190fbe68d5b1257e4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00030-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00030-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d3b755b78802557465ebfb97c1de8ee6a179ee4b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00030-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4a2667fb28a7d528befcc8d1f06d2990b33e7a9de95d0f9979f1441d742b409
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00031-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00031-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c5e5c757d5ace94db8f555b2fb9ca47a0371026b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00031-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0d123c26bad6c1229ccce94622505e863d8111c844d469a23feb4660ef365d1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00032-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00032-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..700a69bc120d5198c682b79aecb204e4063120e5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00032-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61aeecb656b727f834985f4d84f90ffc92d706006b4299de2ee36477f92c93c9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00033-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00033-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bfa8feb3e4a3c26a865ae3ccca90a5edbc56af78
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00033-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d960ed85d56a676d67013f5490ad11ddef2bab401bc363d6778ebf9e82541db4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00034-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00034-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..773f84a35d317be8685d077dba33f82bc9719749
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00034-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2f9c0d6b6ad409186777573a604fc1b7b46c5223449fb5b742a18b2993e48c4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00035-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00035-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..51347bd2ea8e8bd59a5217b1bb36a94d8cd6bf4c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00035-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:978068936cfa921331887e086861dcfd23205cdc84c87283a2362ed8f8de71a5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00036-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00036-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..798b52eb100a8dc4e450fe210b231d5333272df7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00036-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41eb4257386015722049543d70b43b4a3291b7d3bee51e746a9105e133e6619e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00037-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00037-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b4d24f142ad30ca51d659594e77213e37cbed5c5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00037-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:751780b24372d7036d1ea4a2f00dac9280e40402b64bef9815ff9cae73b07d11
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00038-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00038-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cdd66e0b34dc87df3f806bf7bb595363b15c44ce
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00038-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4401b8b90d7f7549eeab352663ac406a8baf5c665938236ef0c2cbd80621f355
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00039-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00039-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3a7df08f4d27d327dce56f8cec9fb2fdbe9cb4ed
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00039-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ef6e9f88e4e53fe46c058280ce6cc3e9670fe67d42b1fe367b2e15eabe5a8fb
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00040-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00040-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8884d9da742a0ae681bf8345479ac37a2c3964ad
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00040-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07263f3757902713caa5081952abf4ed1638f8bcee6d82130aa87a63b4fd2f4d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00041-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00041-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc4420fe50614d04ffc8af2289acc3a84109d260
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00041-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2b3896f32b639244d020fcd9b99ec2b6780b08d3506f157f26b4d4fae41bc13
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00042-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00042-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c7480663b8bf2f7de92ca349c00aa1276c50b469
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00042-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09e7380704c1c039b59db9ecb52de680204f66852571e21904c9acae94fd9b14
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00043-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00043-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c9011de9b52dfdb8127bc371983bf37fe0c0343c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00043-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1016a04fb6640b52264d691bd4a5cc5d42cd88e071808a7cf128c68254ab3676
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00044-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00044-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..87f5cd363d1236736bc6ef34b6989e4cd77a4fdb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00044-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c0978f7bad012f346c0f7c02e24b8254c9e74bea06ba570e7f9befd51c9beea
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00045-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00045-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..78b03e06c6bfb23a254bb2e6339ae16cd6d36e94
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00045-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14aaf38efa7ca1a5eda74c261eb57f9503a8bebc9c32d6954ee4facbb34239b6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00046-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00046-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..72226d8f059356481664c3933c6451ec82ee3cc8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00046-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80283d502f78ec4dfb12900644afadcfa6de479cefe9fac8753b34cc275dd9c5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00047-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00047-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..db1de2e1758995efea22fe52b56389b0b986c28b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00047-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:314dbea56db1538b09d2a4b9f5147a505effb6be91fb74f2b0d2ab1f2088c6cf
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00048-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00048-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2d4f7cfeef1e3501b2bf052676ebae8d463b2508
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00048-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:260f93076cb59b1bc9f78c5a373138b2e8a60067d419e1f7b7ed90ed51889668
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00049-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00049-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9ef84665ab2c625afd82b5ccff84c61753f9606a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00049-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:005fee3689c2285363bd7393ba37ac78f1cc37122c1d36f78808e4e693ae775b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00050-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00050-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0a59bc7c9eb3b154f080e8bf0475f057ffc1fe2c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00050-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa8d5abe44d1ca5bb4805b208865ba61b17ffdcaaf8abff2ae6b321174f1705f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00051-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00051-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc22a92d0197c72ff8b65f908b349a1b89e7b1cf
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00051-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e51b0f63e8ef6208eb3638f88269da4790e21d842179738c85f6adfd7eb61f91
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00052-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00052-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..da84b09cccb3c15fa54517b0c9dc7d7a85e7458b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00052-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb043f4affb7b84e7fbf4f24ed85461dc4638e4dd7ecb3e0cb47d939e0d397b7
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00053-of-00053.bin b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00053-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e14f1b4fb33d8f922d5ef590953b93537bf18754
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model-00053-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc19f6875a6faf209628f95d5ea372ab780489b268ac12b3acbd3cee6500ee56
+size 1207656908
diff --git a/model_hubs/Skywork-13B-Base-2.5T/pytorch_model.bin.index.json b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..163c37a78b34efe7cc858ea3fdca93e4c7c25699
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/pytorch_model.bin.index.json
@@ -0,0 +1 @@
+{"metadata": {"total_size": 27708239872}, "weight_map": {"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.2.input_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.up_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.down_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.3.input_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.up_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.down_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.4.input_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.v_proj.weight": 
"pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.up_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.down_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.5.input_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.up_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.down_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.6.input_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.up_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.down_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.7.input_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.up_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.down_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.8.input_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.up_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.down_proj.weight": "pytorch_model-00009-of-00053.bin", 
"model.layers.9.input_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.up_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.down_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.10.input_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.up_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.down_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.11.input_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.up_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.down_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.12.input_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.up_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.down_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.13.input_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.v_proj.weight": 
"pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.up_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.down_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.14.input_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.up_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.down_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.15.input_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.up_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.down_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.16.input_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.up_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.down_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.17.input_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.up_proj.weight": "pytorch_model-00018-of-00053.bin", 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.18.input_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.up_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.down_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.19.input_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.up_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.down_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.20.input_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.up_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.down_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.21.input_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.up_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.down_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.22.input_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.k_proj.weight": 
"pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.up_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.down_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.23.input_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.up_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.down_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.24.input_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.up_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.down_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.25.input_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.up_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.down_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.26.input_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00027-of-00053.bin", 
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.down_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.27.input_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.up_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.down_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.28.input_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.up_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.down_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.29.input_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.up_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.down_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.30.input_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.up_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.down_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.31.input_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.q_proj.weight": 
"pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.up_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.down_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.32.input_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.up_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.down_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.33.input_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.up_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.down_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.34.input_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.up_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.down_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.35.input_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00036-of-00053.bin", 
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.up_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.down_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.36.input_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.up_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.down_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.37.input_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.up_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.down_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.38.input_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.up_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.down_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.39.input_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.up_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.down_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.40.input_layernorm.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.post_attention_layernorm.weight": 
"pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.rotary_emb.inv_freq": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.up_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.down_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.41.input_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.rotary_emb.inv_freq": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.up_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.down_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.42.input_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.rotary_emb.inv_freq": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.up_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.down_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.43.input_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.rotary_emb.inv_freq": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.up_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.down_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.44.input_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00045-of-00053.bin", 
"model.layers.44.self_attn.rotary_emb.inv_freq": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.up_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.down_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.45.input_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.rotary_emb.inv_freq": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.up_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.down_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.46.input_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.rotary_emb.inv_freq": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.up_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.down_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.47.input_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.rotary_emb.inv_freq": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.up_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.down_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.48.input_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.rotary_emb.inv_freq": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.up_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.down_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.49.input_layernorm.weight": 
"pytorch_model-00050-of-00053.bin", "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.rotary_emb.inv_freq": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.up_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.down_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.50.input_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.rotary_emb.inv_freq": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.up_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.down_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.51.input_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.rotary_emb.inv_freq": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.up_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.down_proj.weight": "pytorch_model-00052-of-00053.bin", "model.norm.weight": "pytorch_model-00053-of-00053.bin", "model.embed_tokens.weight": "pytorch_model-00053-of-00053.bin", "lm_head.weight": "pytorch_model-00053-of-00053.bin"}}
\ No newline at end of file
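The index above (apparently the 2.5T snapshot's pytorch_model.bin.index.json) maps every parameter name to one of the 53 shard files: each of the first 52 shards holds exactly one decoder layer's ten tensors, and the final shard carries the final norm, the input embedding, and the LM head. The metadata total of 27,708,239,872 bytes is consistent with ~13.85B values stored in bfloat16. A minimal sketch of inspecting such an index offline, assuming a local copy at the path shown (path is illustrative):

import json
from collections import Counter

# Illustrative local path; the same layout is used by every Skywork-13B-Base-* snapshot.
index_path = "model_hubs/Skywork-13B-Base-2.5T/pytorch_model.bin.index.json"

with open(index_path) as f:
    index = json.load(f)

print(index["metadata"]["total_size"] / 2**30, "GiB")  # ~25.8 GiB of bf16 weights

# Count how many tensors live in each shard (10 per decoder layer, 3 in the last shard).
shard_counts = Counter(index["weight_map"].values())
for shard, count in sorted(shard_counts.items()):
    print(shard, count)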
diff --git a/model_hubs/Skywork-13B-Base-2.5T/special_tokens_map.json b/model_hubs/Skywork-13B-Base-2.5T/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..d85ba6cb6820b01226ef8bd40b46bb489041c6a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-2.5T/tokenization_skywork.py b/model_hubs/Skywork-13B-Base-2.5T/tokenization_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac378d77d2d90d17340b3cb8eaf91bdb1656b71d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/tokenization_skywork.py
@@ -0,0 +1,250 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+"""Tokenization classes for Skywork."""
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+if TYPE_CHECKING:
+ from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+B_INST, E_INST = "[INST]", "[/INST]"
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
+ that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+class SkyworkTokenizer(PreTrainedTokenizer):
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="<unk>",
+ bos_token="<s>",
+ eos_token="</s>",
+ pad_token=None,
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ clean_up_tokenization_spaces=False,
+ legacy=True,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ self.legacy = legacy
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ legacy=legacy,
+ **kwargs,
+ )
+ if legacy:
+ logger.warning_once(
+ f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+ )
+
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+ def tokenize(self, text, **kwargs) -> List[str]:
+ # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+ # the beginning of the text
+ if not self.legacy:
+ text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+ return super().tokenize(text, **kwargs)
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+ def _tokenize(self, text):
+ if not self.legacy:
+ is_first = text.startswith(SPIECE_UNDERLINE)
+ if is_first:
+ text = text[1:]
+
+ tokens = self.sp_model.encode(text, out_type=str)
+
+ if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+ tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+ return tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (
+ bos_token_id
+ + ([0] * len(token_ids_0))
+ + eos_token_id
+ + bos_token_id
+ + ([0] * len(token_ids_1))
+ + eos_token_id
+ )
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+ if token_ids_1 is not None:
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+ return output
+
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+ dialogue = list(conversation.iter_texts())
+ if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+ [not is_user for is_user, msg in dialogue[1::2]]
+ ):
+ raise ValueError(
+ "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+ )
+
+ dialog_tokens: List[int] = []
+ if len(conversation.past_user_inputs) > 0:
+ if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+ conversation.past_user_inputs[0] = (
+ B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+ )
+ elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+ dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+ dialog_tokens += sum(
+ [
+ [self.bos_token_id]
+ + self.encode(
+ f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+ )
+ + [self.eos_token_id]
+ for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+ ],
+ [],
+ )
+ if not (dialogue[-1][0]):
+ raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
+ dialog_tokens += [self.bos_token_id] + self.encode(
+ f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+ )
+ return dialog_tokens
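SkyworkTokenizer is a thin SentencePiece wrapper in the LLaMA style: it prepends BOS by default (add_bos_token=True), appends no EOS, and decodes by concatenating pieces. A minimal usage sketch, assuming it is run from the checkpoint directory so the module and the tokenizer.model file added below are available locally (sample text is illustrative):

# Run from the checkpoint directory so tokenization_skywork.py is importable.
from tokenization_skywork import SkyworkTokenizer

tok = SkyworkTokenizer(vocab_file="tokenizer.model")  # SentencePiece model shipped via LFS

ids = tok("Skywork is a bilingual base model.")["input_ids"]
print(ids[0] == tok.bos_token_id)          # True: BOS is prepended, no EOS is appended
print(tok.convert_ids_to_tokens(ids[:6]))  # SentencePiece pieces; a leading "▁" marks a word start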
diff --git a/model_hubs/Skywork-13B-Base-2.5T/tokenizer.model b/model_hubs/Skywork-13B-Base-2.5T/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..decbfe220922d6a38ff52541ef3927b97fb7893e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+size 994250
diff --git a/model_hubs/Skywork-13B-Base-2.5T/tokenizer_config.json b/model_hubs/Skywork-13B-Base-2.5T/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c232b8b78a3ad2ce894b9a17628f3821627ccd7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2.5T/tokenizer_config.json
@@ -0,0 +1,40 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SkyworkTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_skywork.SkyworkTokenizer",
+ null
+ ]
+ }
+}
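Because auto_map points AutoTokenizer at tokenization_skywork.SkyworkTokenizer (with no fast variant), the tokenizer can also be loaded through the Auto classes, which requires trust_remote_code=True so the bundled module is executed. A minimal sketch, assuming a local copy of the checkpoint directory (path is illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "model_hubs/Skywork-13B-Base-2.5T",
    trust_remote_code=True,  # the tokenizer class ships with the checkpoint
    use_fast=False,          # only the slow SentencePiece implementation is provided
)
print(tokenizer.__class__.__name__)  # SkyworkTokenizer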
diff --git a/model_hubs/Skywork-13B-Base-2T/config.json b/model_hubs/Skywork-13B-Base-2T/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..176a4ca6fc2d7e436819a6c762c7967edb3a7b3f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "SkyworkForCausalLM"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "hidden_act": "silu",
+ "hidden_size": 4608,
+ "initializer_range": 0.01,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 131072,
+ "model_type": "skywork",
+ "num_attention_heads": 36,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 36,
+ "rms_norm_eps": 1e-06,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.1",
+ "use_cache": true,
+ "vocab_size": 65519
+ }
\ No newline at end of file
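The 2T snapshot uses the same architecture as the other checkpoints: 52 layers, hidden size 4608 with 36 attention heads (a 128-dimensional head), full multi-head attention (num_key_value_heads equals num_attention_heads), SwiGLU MLPs of width 12288, untied embeddings, and a 65,519-token vocabulary. A rough parameter-count sketch from these numbers (illustrative arithmetic only, not an official figure):

hidden, inter, layers, vocab = 4608, 12288, 52, 65519

attn  = 4 * hidden * hidden   # q, k, v, o projections
mlp   = 3 * hidden * inter    # gate, up, down projections
norms = 2 * hidden            # two RMSNorm weights per layer
embed = 2 * vocab * hidden    # untied input embedding + lm_head

total = layers * (attn + mlp + norms) + embed + hidden  # + final norm
print(f"{total / 1e9:.2f}B parameters")  # ~13.85B, consistent with the ~27.7 GB of bf16 shards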
diff --git a/model_hubs/Skywork-13B-Base-2T/configuration_skywork.py b/model_hubs/Skywork-13B-Base-2T/configuration_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbad8ae1e08d431a14c5de719267629feb4cd5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/configuration_skywork.py
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+ model_type = "skywork"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
diff --git a/model_hubs/Skywork-13B-Base-2T/generation_config.json b/model_hubs/Skywork-13B-Base-2T/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aece903f676603332b5bc1b1a29d6e44a8c02464
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/generation_config.json
@@ -0,0 +1,10 @@
+{
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.33.1"
+}
\ No newline at end of file
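These generation defaults (sampling with temperature 0.6 and top-p 0.9, up to 4096 tokens) are picked up automatically by generate() when the model is loaded from this directory, and can be overridden per call. A minimal sketch, assuming a local copy of the checkpoint with its tokenizer files and trust_remote_code enabled as above (path and prompt are illustrative):

from transformers import AutoModelForCausalLM, AutoTokenizer

path = "model_hubs/Skywork-13B-Base-2T"  # illustrative local path
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt")
# do_sample, temperature and top_p default to the generation_config.json values above.
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))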
diff --git a/model_hubs/Skywork-13B-Base-2T/modeling_skywork.py b/model_hubs/Skywork-13B-Base-2T/modeling_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d2898e0e7d379dc6883c4e34043e537689b8bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/modeling_skywork.py
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class SkyworkRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ SkyworkRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # This differs from the paper's layout, but the permutation yields the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base * scaling_factor
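+ # With the defaults shown (base=10000, scaling_factor=100) the effective RoPE base
+ # becomes 1e6; a larger base slows the rotation per position and extends the usable context.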
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
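+ # Illustrative sketch of the two helpers above (the tensor shapes are assumptions,
+ # not values fixed by this file):
+ #
+ #   q = torch.randn(1, 32, 16, 128)          # [bs, num_heads, seq_len, head_dim]
+ #   k = torch.randn(1, 32, 16, 128)
+ #   rope = SkyworkRotaryEmbedding(128)
+ #   cos, sin = rope(q, seq_len=16)           # each [1, 1, 16, 128]
+ #   position_ids = torch.arange(16).unsqueeze(0)
+ #   q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+ #   # q_rot and k_rot keep the input shape [1, 32, 16, 128].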
+
+
+class SkyworkMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ if self.config.pretraining_tp > 1:
+ slice = self.intermediate_size // self.config.pretraining_tp
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+ gate_proj = torch.cat(
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+ )
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+ down_proj = [
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+ ]
+ down_proj = sum(down_proj)
+ else:
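+ # SwiGLU-style gating: down_proj(act(gate_proj(x)) * up_proj(x)). Llama-family
+ # configs typically set hidden_act to "silu", though the activation is taken
+ # from the config rather than hard-coded here.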
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(hidden_states, dim=1, repeats=n_rep). The hidden states go
+ from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
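+ # e.g. (illustrative shapes): with 4 key/value heads, n_rep=8 turns a
+ # [2, 4, 16, 128] tensor into [2, 32, 16, 128], matching 32 attention heads.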
+
+
+class SkyworkAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = SkyworkRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "ntk":
+ self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+ print('-'*80)
+ print(f"USING CUSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
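+ # Illustrative config values (not defaults of this file): setting
+ # config.rope_scaling = {"type": "dynamic", "factor": 2.0} selects
+ # SkyworkDynamicNTKScalingRotaryEmbedding, while rope_scaling = None keeps the
+ # plain SkyworkRotaryEmbedding.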
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
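+ # After repeat_kv the key/value tensors match the query head count, e.g.
+ # (illustrative) [bs, 4, kv_seq_len, head_dim] -> [bs, 32, kv_seq_len, head_dim]
+ # when num_key_value_groups is 8.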
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class SkyworkDecoderLayer(nn.Module):
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = SkyworkAttention(config=config)
+ self.mlp = SkyworkMLP(config)
+ self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+ config_class = SkyworkConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SkyworkDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, SkyworkModel):
+ module.gradient_checkpointing = value
+
+class SkyworkModel(SkyworkPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+ Args:
+ config: SkyworkConfig
+ """
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
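+ # Positions disallowed by either mask carry very large negative values, so they
+ # are effectively zeroed out by the softmax in SkyworkAttention.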
+ return combined_attention_mask
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # past_key_value and output_attentions are captured from the enclosing scope
+ return module(*inputs, past_key_value, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = SkyworkModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ if self.config.pretraining_tp > 1:
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+ logits = torch.cat(logits, dim=-1)
+ else:
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
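+ # e.g. for labels [t0, t1, t2, t3], the logits at positions 0..2 are scored
+ # against t1..t3.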
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
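+ # Minimal usage sketch for SkyworkForCausalLM (illustrative; the checkpoint path,
+ # dtype and prompt are assumptions, not fixed by this file):
+ #
+ #   import torch
+ #   from transformers import AutoModelForCausalLM, AutoTokenizer
+ #
+ #   path = "model_hubs/Skywork-13B-Base-2T"
+ #   tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+ #   model = AutoModelForCausalLM.from_pretrained(
+ #       path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
+ #   )
+ #   inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
+ #   print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))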
+
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = SkyworkModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
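+ # Pool the logits at each sequence's last non-padding token (the token just
+ # before the first pad id, assuming right padding), or at the final position
+ # when no pad id is defined or present.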
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00001-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00001-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45b8dd75bfcf60a3144d913b97eec5646aca9b92
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00001-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28201f8b56bd1e6120dd074a5c1c1f00d0991187d960647defc0081c599ead00
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00002-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00002-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1bff0b6b82afc73e6b6d202b7f610a67379c8893
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00002-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:718d2eb6be263ca5311ad8e6392214ef48b101c559b1434a45aae55f778b1405
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00003-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00003-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e34d86915108c78f0457ca929e54985c9bc185f1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00003-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:293bab6de7054e7127519e1ff8f76cf9077d2215960db01deeeb01f7dc5e4563
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00004-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00004-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..78bd69d2fbc2b1ed6262d1e7cfe5b79f066243cd
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00004-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5c567bb7659f0d17e9efe18c1bdbcb0caa118819574447f2ebb80bbacf683fa
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00005-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00005-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6efcc592dfa4844bd1c238abe00cd9054e936825
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00005-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e7945296b886733cf4c8353852f331aeafa371c9c104e3da196818e48fc1ed6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00006-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00006-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..262578a87fa08b955f141ac144ae1691f9809bff
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00006-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bfcb8e6e51a66027cce7f0acc033318edc9f7aef9f3279eb5d2afe392678924
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00007-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00007-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..24ccdb4326907cd59c81b208e4f8c906c4a481a9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00007-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba02e2a6f170f8027db0b6becfaa9ad57f058dd6b4ea51154fca69c92a7a1f2a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00008-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00008-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..12872e840edd15f4277cc454a3c0b5d5e6b40132
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00008-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bbcb64020284520e37c7ac7916455db85a58931751eed433911e9d2a157f076
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00009-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00009-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f11c17365be29037c5d8cc52e933d4b3b632b4a0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00009-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0776822fbbfed4369e4e28471c18e3009319a3b8d0fd99117cec12151f3fdb83
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00010-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00010-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3cc5bc0db81a46419912189675a5baf5ea6c9f0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00010-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b026cf8a4682a29d5509629bac591c098909e717d1f84dccffca7f82d1cbc00
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00011-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00011-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a0c8746306a71c526c97aaad5cb653c23eb77acd
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00011-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2d62c5687d21ff897a06197ecac0abb3cd899a65644d424b67e8a0b5b668ca5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00012-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00012-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59bd69aa58487b7daaf5f7e7c623703d4aa8cf33
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00012-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f26a60d4103a76312c80c2535d195dda5282c6ea8bea74341df0c84cd7592a8d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00013-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00013-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0ef9ea09be64c40b1cee4aa0ed6c3c17005bb34a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00013-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3a868a9d091030f733759c3bb42be59ece162d694ae42b1bfa76f063f6f7858
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00014-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00014-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b7c9def5d920abf28bb34edfc453199d719921eb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00014-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4798838651a32150d89502fd54270c53cd150a7bead08eddb7712db1b0d82b82
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00015-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00015-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..966cc09cc018fa192fe64f7e2d52c704a59f0059
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00015-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f0ec5c3cf1971116f7b0abee91ccc0f00d8ade60a0e8e176180bcc074c2c1ea
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00016-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00016-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..85c3950d57b25d42d9940633c6cde4e5bfbc35ce
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00016-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75a08f7bf6279d8edd92a4a6f57ce824570738829e5aa4c81c6610787ae20997
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00017-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00017-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..818cf36e3e333b3f77c6aba26b9e01da19f3c7ca
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00017-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdd073c51c6e894f14438a1c43de1758db3871a453b603486327df370711d8a9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00018-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00018-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e247c824281fc4f58a61561c881f9edc208fb93a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00018-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49592d2889e164dc19e57c49d310a1b81e01c4dcff70e541e2aa8625ed7c28ee
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00019-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00019-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ce3642813a0c338a095232dd4e9087ee549d720a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00019-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eef49a0eede8af0d2ee37858af4a5799265c82cf11076f666a0a06e6ddd4b473
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00020-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00020-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ee2251d3d601229308ea7581be8813b113912fc6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00020-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33864ea7071f2f0a21953dd60621fce31201fea066c74f13d36125707c25b31f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00021-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00021-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b0f75e4f37ac6db52d2a36fcbbdbd80462466f5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00021-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fbf160b17232ed6a2ffb43dd3e71d92ececac823e187e4e725fbd034e18619d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00022-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00022-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bac3e6591bcb8f427096a746169ad216ea9a1449
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00022-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3d180e80b5b271963b14407915dbac67a3e4b79ff4c5b94acb70504b7d8ee19
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00023-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00023-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d90959a2bcf8729ecebd98d700cda545acd45895
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00023-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc6a7cca4c7bff1c64872d81f1618cce0302c3c203904f4d8a133adeb271c6e9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00024-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00024-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb4f42eb108889768eb7b0450844b89b45d45965
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00024-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a47db350e97ffe955e881b292590e4272093ac33ceb1fbd89da6b8ff06f76e26
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00025-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00025-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b2badd44e446e1373c286096ecb8ad16b3f15bb3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00025-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10b314d27053574956523e7a5545f7c8e5a45ac2cc90eb3bde56aaa727892143
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00026-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00026-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..219eee62f20430c2e2aeac2677ef4366dd4c8183
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00026-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f49c774e63ec4e6bc83abfa52c52f5e983e792117fc98127ecd126fb0ceab05b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00027-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00027-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cfa80e3736fe3ff841796184e9d1c918a77a9302
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00027-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d37dec4d6f6a3725e4ea1f6828c035d435fcee48afb98240c3e0b9a90b4bd8f
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00028-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00028-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b4d5daff9447a43f229250503d8ab9eabb84ded9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00028-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:569dcda2dda07b506ee8d2b7c80bac8b537c20be63557fe2045f0dbbca16c496
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00029-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00029-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..96c6c1d9b773d48aeb76ff427e1e13fefce94ae2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00029-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b80127c21de71a21214320d5cf4961dc6a4421c0a7cb992d12e3eef2ad72b2f3
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00030-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00030-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..efe413102dd0fdcbb56da81891838cc17b6e1216
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00030-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a23827300290a7670b73d5df71d0ab3040ca73730067d88e9c01c8220d947514
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00031-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00031-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1a5a658515e198161cdd082ee32c69e050d0cdcc
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00031-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d52100571566cf864d43aaa4a1ff913dc15754ac8dcfdc49ada1f890dc4b7f30
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00032-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00032-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8743dfcf5a6546ed26253a7007f839e7d31e7963
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00032-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c7e6821579ce127b7221aa6d94bdb66bad8fc19a5b9efcf21d4025c2a0a8b72
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00033-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00033-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7949aa63a7a1033acc98da1b008955d408ecd22f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00033-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de1e1d29781559105b1f81691d39fe72940df021f11a80199aa6f3b1eb12c31b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00034-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00034-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..25248eb6f0cf51529c5142b428f3c5048c387c25
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00034-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58feab6b7ad0a25346ea9217d4cdf54475f7e6f24772eec14e4dcf96439c91c1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00035-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00035-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7a800b8164751e012d3668e359598110d7107531
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00035-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42bf655796e20648b41fb3fbd034df42bc96095077b48d92e4b104eaeaa92c89
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00036-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00036-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..999c072d74560a1e3e1239719bba877c37474271
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00036-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cb2d67f210ce67a5eba47ffebc73e6dedaca4048005a6e08cd7b8b099851db5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00037-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00037-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..50cc1c11b6f8380f2a6a1099e017c9674e2980bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00037-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1549648214676c1a79055e3e3a41ccf1c214194aef247c417055bad666fe00f1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00038-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00038-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5959c3a48e6b6f36a6122b98e9d922ca63e1d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00038-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96039c505d8dc9ccbd80e0bf9e59da1ba25d54c78369ae4aa0dfb26eb58435e0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00039-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00039-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ebc733d53e5873d699a5daf466ac6113de26d249
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00039-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d779eb1970c79034b7821303e2bed08aee1977260f8639b1d862c86829207644
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00040-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00040-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e8c14ddaa1a73981cf499cb4870203a322a212b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00040-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e25115b7661eb0281e4e1998c557cc4fd9540e93b39d3aea22282eaa967a5ad
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00041-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00041-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..11038fef6c74893556c5703f0f6b51fa2940f9f9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00041-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da6e73916b9300a0bf4ddd209895d88affce60c1781c1e391214dfccd5838e74
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00042-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00042-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b22ccc12ca308d8fa43ee6684a2d9f7a81979015
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00042-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:152a2d2c93e6d97fa29564b4c19b78ef303bd6cb3d0d92d1952eb9579ec6ff49
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00043-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00043-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c664c6a5d17a344959de11add8d2c6fba3dbf25c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00043-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0519b05106a7ed60c427712622e624b502814f41f05cd4689df1462bcb36c3e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00044-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00044-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f92b167b8d702ea537c3cf81403be80b07a0cf16
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00044-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:745d6bc7ee6723525c5d725e6c7ab25f2dba16dee1bde846dd0b0df216b43fa0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00045-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00045-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fccec31760079d351fbbd983959b06280d21c65b
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00045-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e13f9737b72c80107a6fda187af26595e2f8e583d03a90a1c82a6a5e32e9dc00
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00046-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00046-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..23f5120c0d63fc9a625a18ed9687e45d04aace0c
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00046-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:795f70af043b17ef87b432b1dae016223ea22b74d32034bdf853474ad05faab9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00047-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00047-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bdfbe19917624ad0eb012d655be689d6b0e95639
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00047-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89ba8f420060a9d1f3e70af51d122e888b9a16812d6bef6ed5b4f7ee2d276a6d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00048-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00048-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bad74f4ba34e96468aa7e92ae5b790e31961ea71
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00048-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0ca1845bde9d1ba3b6f3a4b7c97db0d77a1a10483cd0ed15330bcad754713e7
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00049-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00049-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e9f684d61f5ca3359b3fb916fc7dda8f667286e4
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00049-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35c5a1006fc4f34a3d26b6cc94690614fee67dc9e965403d674de9c3a09d747
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00050-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00050-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e8dd0bfcbafaa8e53022a3266780fe7e40535231
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00050-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0144910c3b10d76ca06f33fae5453b93e9f97e72450cc4528bdafedb88d062f9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00051-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00051-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e5bc7cac170400a6d748596cb21089b00a6d4a25
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00051-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:484f9bc1e8503668ba075b417b12a1f1fc1577a70f7114ffcdb78760c6ba1204
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00052-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00052-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1484ed2c10b488c40255384f1679a98b0a09697f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00052-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45191a8e436fa1142329a47f1a944b51883629f253fd1dca9b749e841ff179e8
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model-00053-of-00053.bin b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00053-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f0dc0c66816d2131a05c16fe96d21b3dab178042
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model-00053-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a45825e14063223459d1ac24bba572706504471d3ee4d76026f828da3a24135d
+size 1207656908
diff --git a/model_hubs/Skywork-13B-Base-2T/pytorch_model.bin.index.json b/model_hubs/Skywork-13B-Base-2T/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..163c37a78b34efe7cc858ea3fdca93e4c7c25699
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/pytorch_model.bin.index.json
@@ -0,0 +1 @@
+{"metadata": {"total_size": 27708239872}, "weight_map": {"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.2.input_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.up_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.down_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.3.input_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.up_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.down_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.4.input_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.v_proj.weight": 
"pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.up_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.down_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.5.input_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.up_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.down_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.6.input_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.up_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.down_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.7.input_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.up_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.down_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.8.input_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.up_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.down_proj.weight": "pytorch_model-00009-of-00053.bin", 
"model.layers.9.input_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.up_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.down_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.10.input_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.up_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.down_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.11.input_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.up_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.down_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.12.input_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.up_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.down_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.13.input_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.v_proj.weight": 
"pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.up_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.down_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.14.input_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.up_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.down_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.15.input_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.up_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.down_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.16.input_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.up_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.down_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.17.input_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.up_proj.weight": "pytorch_model-00018-of-00053.bin", 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.18.input_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.up_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.down_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.19.input_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.up_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.down_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.20.input_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.up_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.down_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.21.input_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.up_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.down_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.22.input_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.k_proj.weight": 
"pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.up_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.down_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.23.input_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.up_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.down_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.24.input_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.up_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.down_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.25.input_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.up_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.down_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.26.input_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00027-of-00053.bin", 
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.down_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.27.input_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.up_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.down_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.28.input_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.up_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.down_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.29.input_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.up_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.down_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.30.input_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.up_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.down_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.31.input_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.q_proj.weight": 
"pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.up_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.down_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.32.input_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.up_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.down_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.33.input_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.up_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.down_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.34.input_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.up_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.down_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.35.input_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00036-of-00053.bin", 
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.up_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.down_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.36.input_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.up_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.down_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.37.input_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.up_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.down_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.38.input_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.up_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.down_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.39.input_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.up_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.down_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.40.input_layernorm.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.post_attention_layernorm.weight": 
"pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.rotary_emb.inv_freq": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.up_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.down_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.41.input_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.rotary_emb.inv_freq": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.up_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.down_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.42.input_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.rotary_emb.inv_freq": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.up_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.down_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.43.input_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.rotary_emb.inv_freq": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.up_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.down_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.44.input_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00045-of-00053.bin", 
"model.layers.44.self_attn.rotary_emb.inv_freq": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.up_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.down_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.45.input_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.rotary_emb.inv_freq": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.up_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.down_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.46.input_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.rotary_emb.inv_freq": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.up_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.down_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.47.input_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.rotary_emb.inv_freq": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.up_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.down_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.48.input_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.rotary_emb.inv_freq": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.up_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.down_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.49.input_layernorm.weight": 
"pytorch_model-00050-of-00053.bin", "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.rotary_emb.inv_freq": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.up_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.down_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.50.input_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.rotary_emb.inv_freq": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.up_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.down_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.51.input_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.rotary_emb.inv_freq": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.up_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.down_proj.weight": "pytorch_model-00052-of-00053.bin", "model.norm.weight": "pytorch_model-00053-of-00053.bin", "model.embed_tokens.weight": "pytorch_model-00053-of-00053.bin", "lm_head.weight": "pytorch_model-00053-of-00053.bin"}}
\ No newline at end of file
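
The weight map above assigns every parameter tensor to one of the 53 shards; transformers consults this index when loading the sharded checkpoint, but it can also be read directly to fetch a single tensor. A minimal sketch, assuming a local copy of the checkpoint directory (the path and index filename are illustrative of the usual layout):

# Locate and load one tensor from its shard using the index above.
import json
import torch

ckpt_dir = "model_hubs/Skywork-13B-Base-2T"  # illustrative path
with open(f"{ckpt_dir}/pytorch_model.bin.index.json") as f:
    index = json.load(f)

name = "model.layers.0.self_attn.q_proj.weight"
shard = index["weight_map"][name]            # e.g. "pytorch_model-00001-of-00053.bin"
state = torch.load(f"{ckpt_dir}/{shard}", map_location="cpu")
print(state[name].shape)                     # expected (4608, 4608) given the model config
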
diff --git a/model_hubs/Skywork-13B-Base-2T/special_tokens_map.json b/model_hubs/Skywork-13B-Base-2T/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..d85ba6cb6820b01226ef8bd40b46bb489041c6a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-2T/tokenization_skywork.py b/model_hubs/Skywork-13B-Base-2T/tokenization_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac378d77d2d90d17340b3cb8eaf91bdb1656b71d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/tokenization_skywork.py
@@ -0,0 +1,250 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+"""Tokenization classes for Skywork."""
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+if TYPE_CHECKING:
+ from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+B_INST, E_INST = "[INST]", "[/INST]"
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
+that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+class SkyworkTokenizer(PreTrainedTokenizer):
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="<unk>",
+ bos_token="<s>",
+ eos_token="</s>",
+ pad_token=None,
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ clean_up_tokenization_spaces=False,
+ legacy=True,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ self.legacy = legacy
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ legacy=legacy,
+ **kwargs,
+ )
+ if legacy:
+ logger.warning_once(
+ f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+ )
+
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+ def tokenize(self, text, **kwargs) -> List[str]:
+ # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+ # the beginning of the text
+ if not self.legacy:
+ text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+ return super().tokenize(text, **kwargs)
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+ def _tokenize(self, text):
+ if not self.legacy:
+ is_first = text.startswith(SPIECE_UNDERLINE)
+ if is_first:
+ text = text[1:]
+
+ tokens = self.sp_model.encode(text, out_type=str)
+
+ if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+ tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+ return tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (
+ bos_token_id
+ + ([0] * len(token_ids_0))
+ + eos_token_id
+ + bos_token_id
+ + ([0] * len(token_ids_1))
+ + eos_token_id
+ )
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+ if token_ids_1 is not None:
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+ return output
+
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+ dialogue = list(conversation.iter_texts())
+ if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+ [not is_user for is_user, msg in dialogue[1::2]]
+ ):
+ raise ValueError(
+ "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+ )
+
+ dialog_tokens: List[int] = []
+ if len(conversation.past_user_inputs) > 0:
+ if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+ conversation.past_user_inputs[0] = (
+ B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+ )
+ elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+ dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+ dialog_tokens += sum(
+ [
+ [self.bos_token_id]
+ + self.encode(
+ f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+ )
+ + [self.eos_token_id]
+ for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+ ],
+ [],
+ )
+ if not (dialogue[-1][0]):
+ raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
+ dialog_tokens += [self.bos_token_id] + self.encode(
+ f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+ )
+ return dialog_tokens
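
_build_conversation_input_ids wraps each user turn in [INST] ... [/INST] and folds a <<SYS>> ... <</SYS>> system block into the first user message before encoding. A string-level illustration of the resulting template, with a made-up user message (the actual token ids then come from the sentencepiece model, one BOS per turn):

# Template assembled by the conversation builder above (string level only).
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
system = "You are a helpful assistant."
user = "Summarize the Skywork-13B release in one sentence."
prompt = f"{B_INST} {B_SYS}{system}{E_SYS}{user} {E_INST}"
print(prompt)
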
diff --git a/model_hubs/Skywork-13B-Base-2T/tokenizer.model b/model_hubs/Skywork-13B-Base-2T/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..decbfe220922d6a38ff52541ef3927b97fb7893e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+size 994250
diff --git a/model_hubs/Skywork-13B-Base-2T/tokenizer_config.json b/model_hubs/Skywork-13B-Base-2T/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c232b8b78a3ad2ce894b9a17628f3821627ccd7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-2T/tokenizer_config.json
@@ -0,0 +1,40 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SkyworkTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_skywork.SkyworkTokenizer",
+ null
+ ]
+ }
+}
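
The auto_map entry tells AutoTokenizer to import SkyworkTokenizer from tokenization_skywork.py, which only happens when remote code is allowed. A hedged usage sketch, with the checkpoint path illustrative:

# Load the custom tokenizer through auto_map; requires trust_remote_code and sentencepiece.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("model_hubs/Skywork-13B-Base-2T", trust_remote_code=True)
ids = tok("Skywork-13B", return_tensors="pt").input_ids
print(ids[0, 0].item() == tok.bos_token_id)   # True: add_bos_token=True prepends <s>
print(tok.model_max_length)                   # effectively unbounded per the config above
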
diff --git a/model_hubs/Skywork-13B-Base-3T/config.json b/model_hubs/Skywork-13B-Base-3T/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..176a4ca6fc2d7e436819a6c762c7967edb3a7b3f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "SkyworkForCausalLM"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 0,
+ "hidden_act": "silu",
+ "hidden_size": 4608,
+ "initializer_range": 0.01,
+ "intermediate_size": 12288,
+ "max_position_embeddings": 131072,
+ "model_type": "skywork",
+ "num_attention_heads": 36,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 36,
+ "rms_norm_eps": 1e-06,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.1",
+ "use_cache": true,
+ "vocab_size": 65519
+ }
\ No newline at end of file
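
The sizes in this config fix the parameter count: hidden_size 4608, intermediate_size 12288, 52 layers and an untied 65519-token vocabulary give roughly 13.85B parameters, which at two bytes per bfloat16 value matches the ~27.7 GB total_size recorded in the shard index earlier in this change. A quick check using only the numbers above:

# Back-of-the-envelope parameter count from the config values.
h, inter, layers, vocab = 4608, 12288, 52, 65519

attn = 4 * h * h                    # q_proj, k_proj, v_proj, o_proj
mlp = 3 * h * inter                 # gate_proj, up_proj, down_proj
norms = 2 * h                       # input + post-attention RMSNorm weights
per_layer = attn + mlp + norms

total = layers * per_layer + 2 * vocab * h + h   # + embed_tokens, lm_head, final norm
print(total)             # 13854113280 parameters
print(total * 2 / 1e9)   # ~27.7 GB in bfloat16, consistent with the index metadata
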
diff --git a/model_hubs/Skywork-13B-Base-3T/configuration_skywork.py b/model_hubs/Skywork-13B-Base-3T/configuration_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbbad8ae1e08d431a14c5de719267629feb4cd5a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/configuration_skywork.py
@@ -0,0 +1,89 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class SkyworkConfig(PretrainedConfig):
+
+ model_type = "skywork"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=1,
+ eos_token_id=2,
+ pretraining_tp=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.pretraining_tp = pretraining_tp
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+ f"got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "ntk"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
diff --git a/model_hubs/Skywork-13B-Base-3T/generation_config.json b/model_hubs/Skywork-13B-Base-3T/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..aece903f676603332b5bc1b1a29d6e44a8c02464
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/generation_config.json
@@ -0,0 +1,10 @@
+{
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.6,
+ "top_p": 0.9,
+ "transformers_version": "4.33.1"
+}
\ No newline at end of file
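
These defaults select nucleus sampling (do_sample with temperature 0.6 and top_p 0.9) up to 4096 tokens and are picked up automatically when the checkpoint directory is loaded. A hedged end-to-end sketch; the path is illustrative, a bf16 13B model needs roughly 28 GB of accelerator memory, and device_map="auto" additionally assumes accelerate is installed:

# Sketch only: load the base model and sample with the generation defaults above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "model_hubs/Skywork-13B-Base-3T"
tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
inputs = tok("The capital of France is", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)   # sampling params come from generation_config.json
print(tok.decode(out[0], skip_special_tokens=True))
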
diff --git a/model_hubs/Skywork-13B-Base-3T/modeling_skywork.py b/model_hubs/Skywork-13B-Base-3T/modeling_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..93d2898e0e7d379dc6883c4e34043e537689b8bb
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/modeling_skywork.py
@@ -0,0 +1,911 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_skywork import SkyworkConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "SkyworkConfig"
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make the causal mask used for autoregressive (uni-directional) self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ bsz, src_len = mask.size()
+ tgt_len = tgt_len if tgt_len is not None else src_len
+
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+ inverted_mask = 1.0 - expanded_mask
+
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
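+
+# Shape example: for a batch of 2, a 4-token target and 3 cached past tokens,
+# _make_causal_mask returns a [2, 1, 4, 7] tensor whose future positions hold the
+# dtype minimum, while _expand_mask broadcasts a [2, 7] padding mask to the same
+# [2, 1, 4, 7] shape, so the two masks can simply be added before the softmax.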
+
+
+class SkyworkRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ SkyworkRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+class SkyworkRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Differs from the paper's formulation: an equivalent permutation is used to obtain the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+
+class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ t = t / self.scaling_factor
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Differs from the paper's formulation: an equivalent permutation is used to obtain the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
+ """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
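+ # Dynamic NTK scaling: once the requested length exceeds the training context, the RoPE base is enlarged by
+ # ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)),
+ # so the rotary frequencies stretch smoothly with sequence length instead of being truncated.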
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ # Differs from the paper's formulation: an equivalent permutation is used to obtain the same result.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+
+class SkyworkNTKScalingRotaryEmbedding(torch.nn.Module):
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=100, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base * scaling_factor
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+ )
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
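+ # Standard RoPE application: rotate the paired halves of q and k by the position-dependent angle, i.e. q * cos + rotate_half(q) * sin.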
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class SkyworkMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, x):
+ if self.config.pretraining_tp > 1:
+ slice = self.intermediate_size // self.config.pretraining_tp
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+
+ gate_proj = torch.cat(
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+ )
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+ down_proj = [
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+ ]
+ down_proj = sum(down_proj)
+ else:
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+ """
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+ """
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+ if n_rep == 1:
+ return hidden_states
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class SkyworkAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = SkyworkRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "ntk":
+ self.rotary_emb = SkyworkNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+ print('-'*80)
+ print(f"USING COSTOM MODELING, scaling_type is {scaling_type}, scaling_factor is {scaling_factor}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ if self.config.pretraining_tp > 1:
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+ query_slices = self.q_proj.weight.split(
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+ )
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+ query_states = torch.cat(query_states, dim=-1)
+
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+ key_states = torch.cat(key_states, dim=-1)
+
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+ value_states = torch.cat(value_states, dim=-1)
+
+ else:
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ past_key_value = (key_states, value_states) if use_cache else None
+
+ # repeat k/v heads if n_kv_heads < n_heads
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ if self.config.pretraining_tp > 1:
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+ else:
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class SkyworkDecoderLayer(nn.Module):
+ def __init__(self, config: SkyworkConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = SkyworkAttention(config=config)
+ self.mlp = SkyworkMLP(config)
+ self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+class SkyworkPreTrainedModel(PreTrainedModel):
+ config_class = SkyworkConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["SkyworkDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, SkyworkModel):
+ module.gradient_checkpointing = value
+
+class SkyworkModel(SkyworkPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]
+
+ Args:
+ config: SkyworkConfig
+ """
+
+ def __init__(self, config: SkyworkConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+ # create causal mask
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ combined_attention_mask = None
+ if input_shape[-1] > 1:
+ combined_attention_mask = _make_causal_mask(
+ input_shape,
+ inputs_embeds.dtype,
+ device=inputs_embeds.device,
+ past_key_values_length=past_key_values_length,
+ )
+
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+ inputs_embeds.device
+ )
+ combined_attention_mask = (
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+ )
+
+ return combined_attention_mask
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+ else:
+ position_ids = position_ids.view(-1, seq_length).long()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ # embed positions
+ if attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ return module(*inputs, past_key_value, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ attention_mask,
+ position_ids,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+
+class SkyworkForCausalLM(SkyworkPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = SkyworkModel(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+ if self.config.pretraining_tp > 1:
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+ logits = torch.cat(logits, dim=-1)
+ else:
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
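+ # With a populated cache, only the newest token needs to be re-encoded; earlier positions already live in past_key_values.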
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
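+ # Reorder the cached key/value tensors along the batch dimension so they track the beams selected at this step of beam search.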
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = SkyworkModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+ logits.device
+ )
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
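As a quick orientation to the rope_scaling switch implemented in _init_rope above, here is a minimal sketch (assumptions: the local directory path and that config/tokenizer files are present there) of selecting one of the scaling variants when loading the checkpoint:

from transformers import AutoConfig, AutoModelForCausalLM

model_dir = "model_hubs/Skywork-13B-Base-3T"  # assumed local path

config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
# "linear" -> SkyworkLinearScalingRotaryEmbedding, "dynamic" -> SkyworkDynamicNTKScalingRotaryEmbedding,
# "ntk" -> SkyworkNTKScalingRotaryEmbedding; per the config validation, factor must be a float > 1.
config.rope_scaling = {"type": "ntk", "factor": 100.0}

model = AutoModelForCausalLM.from_pretrained(
    model_dir, config=config, trust_remote_code=True, torch_dtype="auto"
)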
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9303b7bd1e0c4c1eed4bba66ac2ed298e222e707
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00001-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69626f67345dd2378ea1155f152804fb4886b151f2e43ebe3b2d6f33c80e606e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dcdb1009bafbfb50ce114291f5f07f33ea5be3b2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00002-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7b651c6dde0c0a430a94dce24d3560bd07db9ed35f1f1cac9edd530e441b5f0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5d5fdd9878d5120cef78cf977aff6e879e53ba8f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00003-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8359b7ecc78b02a619751c96f60aec6fee4a2595db3f36cd61a5391838fc7ce1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4024f0cf462272635fbf74ec65a80485704ef0e3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00004-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69c333cb8fcfbe365e3bdcd260e5ff91601da65662a5718d002639937ef3cefb
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..63fb4ca1846d2cf5643a9e0e0f50e78e1335a607
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00005-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4277ccd1d2175e039075cc6fe2b95e213a590e9eabd35ef26785b998b4f2ad84
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..574726695542cd009aa60e9a5ed445a95b34aeef
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00006-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:596649ca586587c17956487cda102ce7ce3c5c950ad89ba1bb8c9ef9224b5a01
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6012fb8d748d3dc45c04f3e5d2fe614eb2c67de3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00007-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:953fdae0c98a276f579647ab7595cf9548ec3e46cc433364ba23cfa9b2a77e0a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..81e5ee182890730806fc79a669c386d93a76fa4f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00008-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9648394bc03bc87c8913c31893fbdc55dbbbfeaf7041fc4e5b4946469261f026
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..481f3aeda71f79884e3f495815184f6bd3c6808f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00009-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72c026f7373e85d29e17dd6501c0622da44211273993cc641da8551468a8063d
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6cb445096e2cfc9a7e582b4dce625fabcd1814d8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00010-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b1f53581597d14404ffd763f646f2a8346f89d3e84fd17f88ea0bc779bcd8cf
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..60b7d4629b2acfb518c405970cf59848a87dc9dd
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00011-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4be0dc98193a0e3f421c7d2ebe1fa910ea62d70e9bb32b0f4cac7b69326c550c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..41468e1d8288ca66cc1448da451c226a040a69a6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00012-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94f369d5ba5c26dbf9b2b4ab27803068b220db8a38db5ac644eb24fa79a7b963
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4ae8dfaf384070f2ef564c0a71538a15d17eadc9
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00013-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b766e0e9ceb543f6ca5ebd5e4dac937bf58a7e14a3d13e1ae6d50fec820bfb8b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..92f6d874ba17d482860ab107ef6f1deda2e8e9fa
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00014-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5806356cccd558f1d05643ba00e23db77a8c06a174bfa5831efcd8283582776
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d0c0586f395d56fea43df48e08037f3b8ee208a0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00015-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d198045292b6d3d0a78f907d67491541e4e5768feb9dfcde5dda18a7f8d8cdfa
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..78bc25d0818412b843a18d3bd5c3d62b17e1e706
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00016-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db78b41b807f6c7b1bb6aa7372dbc14c9da9ce0cd4d7c4e89955cfa6e4400f0b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56d1a731b4c11f3dbe1d719f64adc5a2f58711b1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00017-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87ef520c42607a36a63f8c7e5e6513e46ccdcbaa077f6a7fa0a17f5663a19cae
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1ca52ae7937f95f69459ae1a2f6bdae86feb15f2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00018-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a43230a0381dc5d4a45f874376d6d1b7c24fb02dd6d9ad62dc934fa4bdbf22c9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d80b64ef4bc1cd6eabfbe096b357bb653dc0ed97
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00019-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ea20023ae3238034245a47e2d782fa43798074640af65d355d08b5e2c7a3968
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d0fe303c992941ba78a23a881c45da8bc4b45111
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00020-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18465d3d867b0c517d4dcdf61c8088e26a7a16c42d5d17e6efa1d685725d5028
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..922d42ae5fb9fbbf9de285483b646e378260b49f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00021-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:977e108f2584ff9a4f846f3ec4b515c4e28216b1439df82adb7e611a452929a7
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dce9fe1c16c68f81ff9dd601f7582ec91925ad30
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00022-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7236496347fda53cf7d1c4daef8c380b6e42d1ec0a6edb44fd9d0e38b6db6419
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9d428b9c6d69735100d76b81c47b0ce76807de23
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00023-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b17bedf66127af7638e660befbc5defd047024d8b63fcd590626af272e3c5cc1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..efd20cdb648b1be9123028d24e07a1476115b4f3
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00024-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e76c6c43337c94f45fd1b3f3365d14dfabbfd38cfc9074db75b18d9504052d67
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..672fc5bddd84cd5fbfa955e0b553a00c6f840f57
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00025-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:250d260874617a3944545996ba6bd0291b5c748a6e8c603a235fb1f882a93b47
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cf3dcf75f8dd2e212cf73cc561e3cb36efb544e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00026-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52023afeed136ba561793db69a90c23655f637175890e19080a01f62f87cdac9
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6c2a64b4b153e2785898051928094355946da704
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00027-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54d3e82f5176315c470e4615fd5e1be85383351077fcceb684ab436f3e6796a1
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3d8ac9971e13ccde8cfa6e35889c7e1cd49eebc6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00028-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7aff27ef911125a991c6b58d8e809b319e847ff558cd12d49b025e06d3c5728e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..021f0c6884915f5bbad2ce59add724002d7c795f
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00029-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1832c51bef229f9014fa518c5cff4dcc39723ab853c880335796f5740914c80
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef1154485bf75cac5ec4aba6f77743698a0b725d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00030-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a72d1a732e9a99abf661f5e109c589635d1a86886bac7f2aa80a7b6c409f8be
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..05ea64b551989a86be125b2c6517d56b364361c4
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00031-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4114c138eb5a1bf68bd29a125c31e24201c3a5cf95ec3757c2b2f366b45befe6
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4d8187ebb92ce48307016ab9bc49b4d5b8fa3a8d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00032-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31aaa207f4a4c1042868c0320cec86c8f7aed5ba0940c2552f9c6be88526398a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..14b2e3af217d371f819d0edaa3e3a55e26609dd5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00033-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abf15bdd7289fabf72251cd45026e78b6bdd5c8a8c849d3ab4658521a9b5383e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f4b8ae797e2d88866545c33b82c3b1e198cc6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00034-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7ce1cb13ef764d3da8c01e65a7cb12e9b78fc7410555441851ad05138cbfdbc
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0375cdd22a38fe10a37583b404a5e62c98fd47e5
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00035-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0c95670bea6f294572d6e0c7d6f410a378ed546c8beb239ee86d4b858574096
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..37b06dd03e963ce25d9905478a30e9e40cdb9ee2
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00036-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de7e7cc3ece84abbc58aa053644369e5efce366da96dfd94a70477eaacc4edcc
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9ea37af207c1e8b71536cae9251fc36fbff4bcfe
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00037-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46c8254b6f12d8bb032f4cfa9c4596f06692eb307cfd2a617d7c4639282fdcf4
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6e786f61dc0a4454552c2130124ca3cd2d48b8d8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00038-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6784ebd555e163c58262116bd192a2bc2679d30311c8387fb8a652f9ae3d082b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8abae969626746f1d9648ed3a78bda0993616fe6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00039-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30db78de4bdbedd9066f08a8f0fc4fa6558ec22503df34d289bcad5dee26dbb8
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c94c8fe92d17c6dc5c5d241f95c554e768bed582
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00040-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:571fe808760c495bf1b97396ec9ef4ce3bd5c1003d692dfcb7d3ad7197023a67
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a62db24345c4816505c53beb85ae4017f3ff6f56
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00041-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ca91a4280c92688d97088bf2d3f1a0da6dcbed864f91f73ae7cec4bfc8496a0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..284845b41de77d53b1263c9eea855e595190cfa0
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00042-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:572a6a020f42889d4e41596017022e880a4565cac9fa7b6072e6db21955dee78
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f673217e823bb804fe724f26ec97411b100ecac6
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00043-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59ebb7958ba6de1cd89e31ef83e6786eacc07d8de467234543a6356466e820e0
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be8fcca5a22cc82a0dbc2bfd4ec35c6faf4942a1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00044-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b364a668f019e0896567dfcde7ed9d131bef3b602b4bb9e932430820ca4101
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..237fc1d08a95e3e454c33aa901cb79f97f45649d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00045-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1c4819c9218302cfe4ad737bc079eb75aff0c58f854e82d391de66b8ae8724c
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00046-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00046-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..015a269508b74e3615d58284fee375abf18abfff
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00046-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80692b714d33ec725a47009bec70a296365667698ea8d8b49c48d49cf753f969
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00047-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00047-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1f9c9d4d9f5aa5316ff01205bf479642bf8b8455
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00047-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf8cf7e93f03160ef12731981c49e2295a85df6daa06aeb5f9ba02c486dabb6a
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00048-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00048-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e2c01149adf9d712ad6ed00049f00260aa32e0f1
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00048-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08ea3cc15bd70ec948e83cdce1f857dd7580c2924acddec3215469c59b9c256b
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00049-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00049-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d6fdd0c26be778a728c2fba5e54e31198c73c684
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00049-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25dcf34c85d0475596421183723a163a1a09b9964f9375ed671935d0060994dd
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00050-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00050-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e9681156ba7959f9f007aa6d9a60c5076fa2af08
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00050-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfd81aa8a0213eb25a9272944191d3c504de4eb60cba099b8c9f29093b55034e
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00051-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00051-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a8bd944ef9e20bc610746133db68a98e1673c09a
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00051-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e815bbf127cb17a72b0be5bf29d03fcd42b9d2e7824f96ff0c897433fe22f31
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00052-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00052-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b4b3a925263fbdd2695e5058d70c059ec2f6c68d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00052-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a52f7e43ffa1d8003d4671d77c2ad6e9f7495889b2af607b42c0055a0aeef8f5
+size 509630194
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model-00053-of-00053.bin b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00053-of-00053.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ff029f005d1bad7adeef623f62b3d864a8fa4acf
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model-00053-of-00053.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fad3641e4921e518a700f4d6c0b86299ee2343b263190e7bce5968e98afde8d
+size 1207656908
diff --git a/model_hubs/Skywork-13B-Base-3T/pytorch_model.bin.index.json b/model_hubs/Skywork-13B-Base-3T/pytorch_model.bin.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..163c37a78b34efe7cc858ea3fdca93e4c7c25699
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/pytorch_model.bin.index.json
@@ -0,0 +1 @@
+{"metadata": {"total_size": 27708239872}, "weight_map": {"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00053.bin", "model.layers.1.input_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.up_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.1.mlp.down_proj.weight": "pytorch_model-00002-of-00053.bin", "model.layers.2.input_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.up_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.2.mlp.down_proj.weight": "pytorch_model-00003-of-00053.bin", "model.layers.3.input_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.up_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.3.mlp.down_proj.weight": "pytorch_model-00004-of-00053.bin", "model.layers.4.input_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.v_proj.weight": 
"pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.up_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.4.mlp.down_proj.weight": "pytorch_model-00005-of-00053.bin", "model.layers.5.input_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.up_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.5.mlp.down_proj.weight": "pytorch_model-00006-of-00053.bin", "model.layers.6.input_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.up_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.6.mlp.down_proj.weight": "pytorch_model-00007-of-00053.bin", "model.layers.7.input_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.up_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.7.mlp.down_proj.weight": "pytorch_model-00008-of-00053.bin", "model.layers.8.input_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.up_proj.weight": "pytorch_model-00009-of-00053.bin", "model.layers.8.mlp.down_proj.weight": "pytorch_model-00009-of-00053.bin", 
"model.layers.9.input_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.up_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.9.mlp.down_proj.weight": "pytorch_model-00010-of-00053.bin", "model.layers.10.input_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.up_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.10.mlp.down_proj.weight": "pytorch_model-00011-of-00053.bin", "model.layers.11.input_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.up_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.11.mlp.down_proj.weight": "pytorch_model-00012-of-00053.bin", "model.layers.12.input_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.up_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.12.mlp.down_proj.weight": "pytorch_model-00013-of-00053.bin", "model.layers.13.input_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.v_proj.weight": 
"pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.up_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.13.mlp.down_proj.weight": "pytorch_model-00014-of-00053.bin", "model.layers.14.input_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.up_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.14.mlp.down_proj.weight": "pytorch_model-00015-of-00053.bin", "model.layers.15.input_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.up_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.15.mlp.down_proj.weight": "pytorch_model-00016-of-00053.bin", "model.layers.16.input_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.up_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.16.mlp.down_proj.weight": "pytorch_model-00017-of-00053.bin", "model.layers.17.input_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.17.mlp.up_proj.weight": "pytorch_model-00018-of-00053.bin", 
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00018-of-00053.bin", "model.layers.18.input_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.up_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.18.mlp.down_proj.weight": "pytorch_model-00019-of-00053.bin", "model.layers.19.input_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.up_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.19.mlp.down_proj.weight": "pytorch_model-00020-of-00053.bin", "model.layers.20.input_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.up_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.20.mlp.down_proj.weight": "pytorch_model-00021-of-00053.bin", "model.layers.21.input_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.up_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.21.mlp.down_proj.weight": "pytorch_model-00022-of-00053.bin", "model.layers.22.input_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.k_proj.weight": 
"pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.up_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.22.mlp.down_proj.weight": "pytorch_model-00023-of-00053.bin", "model.layers.23.input_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.up_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.23.mlp.down_proj.weight": "pytorch_model-00024-of-00053.bin", "model.layers.24.input_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.up_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.24.mlp.down_proj.weight": "pytorch_model-00025-of-00053.bin", "model.layers.25.input_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.up_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.25.mlp.down_proj.weight": "pytorch_model-00026-of-00053.bin", "model.layers.26.input_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00027-of-00053.bin", 
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.26.mlp.down_proj.weight": "pytorch_model-00027-of-00053.bin", "model.layers.27.input_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.up_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.27.mlp.down_proj.weight": "pytorch_model-00028-of-00053.bin", "model.layers.28.input_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.up_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.28.mlp.down_proj.weight": "pytorch_model-00029-of-00053.bin", "model.layers.29.input_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.up_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.29.mlp.down_proj.weight": "pytorch_model-00030-of-00053.bin", "model.layers.30.input_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.up_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.30.mlp.down_proj.weight": "pytorch_model-00031-of-00053.bin", "model.layers.31.input_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.q_proj.weight": 
"pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.up_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.31.mlp.down_proj.weight": "pytorch_model-00032-of-00053.bin", "model.layers.32.input_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.up_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.32.mlp.down_proj.weight": "pytorch_model-00033-of-00053.bin", "model.layers.33.input_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.up_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.33.mlp.down_proj.weight": "pytorch_model-00034-of-00053.bin", "model.layers.34.input_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.up_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.34.mlp.down_proj.weight": "pytorch_model-00035-of-00053.bin", "model.layers.35.input_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00036-of-00053.bin", 
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.up_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.35.mlp.down_proj.weight": "pytorch_model-00036-of-00053.bin", "model.layers.36.input_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.up_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.36.mlp.down_proj.weight": "pytorch_model-00037-of-00053.bin", "model.layers.37.input_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.up_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.37.mlp.down_proj.weight": "pytorch_model-00038-of-00053.bin", "model.layers.38.input_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.up_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.38.mlp.down_proj.weight": "pytorch_model-00039-of-00053.bin", "model.layers.39.input_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.up_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.39.mlp.down_proj.weight": "pytorch_model-00040-of-00053.bin", "model.layers.40.input_layernorm.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.post_attention_layernorm.weight": 
"pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.self_attn.rotary_emb.inv_freq": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.up_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.40.mlp.down_proj.weight": "pytorch_model-00041-of-00053.bin", "model.layers.41.input_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.self_attn.rotary_emb.inv_freq": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.up_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.41.mlp.down_proj.weight": "pytorch_model-00042-of-00053.bin", "model.layers.42.input_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.self_attn.rotary_emb.inv_freq": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.up_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.42.mlp.down_proj.weight": "pytorch_model-00043-of-00053.bin", "model.layers.43.input_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.self_attn.rotary_emb.inv_freq": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.up_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.43.mlp.down_proj.weight": "pytorch_model-00044-of-00053.bin", "model.layers.44.input_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00045-of-00053.bin", 
"model.layers.44.self_attn.rotary_emb.inv_freq": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.up_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.44.mlp.down_proj.weight": "pytorch_model-00045-of-00053.bin", "model.layers.45.input_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.self_attn.rotary_emb.inv_freq": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.up_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.45.mlp.down_proj.weight": "pytorch_model-00046-of-00053.bin", "model.layers.46.input_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.self_attn.rotary_emb.inv_freq": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.up_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.46.mlp.down_proj.weight": "pytorch_model-00047-of-00053.bin", "model.layers.47.input_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.self_attn.rotary_emb.inv_freq": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.up_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.47.mlp.down_proj.weight": "pytorch_model-00048-of-00053.bin", "model.layers.48.input_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.self_attn.rotary_emb.inv_freq": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.up_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.48.mlp.down_proj.weight": "pytorch_model-00049-of-00053.bin", "model.layers.49.input_layernorm.weight": 
"pytorch_model-00050-of-00053.bin", "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.self_attn.rotary_emb.inv_freq": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.up_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.49.mlp.down_proj.weight": "pytorch_model-00050-of-00053.bin", "model.layers.50.input_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.self_attn.rotary_emb.inv_freq": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.up_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.50.mlp.down_proj.weight": "pytorch_model-00051-of-00053.bin", "model.layers.51.input_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.self_attn.rotary_emb.inv_freq": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.up_proj.weight": "pytorch_model-00052-of-00053.bin", "model.layers.51.mlp.down_proj.weight": "pytorch_model-00052-of-00053.bin", "model.norm.weight": "pytorch_model-00053-of-00053.bin", "model.embed_tokens.weight": "pytorch_model-00053-of-00053.bin", "lm_head.weight": "pytorch_model-00053-of-00053.bin"}}
\ No newline at end of file
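Note on the index file above: it is the standard Hugging Face sharded-checkpoint manifest, where metadata.total_size is the combined byte size of the shards and weight_map maps every parameter name to the shard that stores it. A minimal sketch of how the map can be used to pull a single tensor without loading all 53 shards (the local path assumes the LFS objects have already been fetched):

import json
import os
import torch

ckpt_dir = "model_hubs/Skywork-13B-Base-3T"  # assumed local checkpoint directory

with open(os.path.join(ckpt_dir, "pytorch_model.bin.index.json")) as f:
    index = json.load(f)

# weight_map tells us which shard holds the parameter; load only that shard.
name = "model.layers.40.mlp.down_proj.weight"
shard = torch.load(os.path.join(ckpt_dir, index["weight_map"][name]), map_location="cpu")
print(name, tuple(shard[name].shape))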
diff --git a/model_hubs/Skywork-13B-Base-3T/special_tokens_map.json b/model_hubs/Skywork-13B-Base-3T/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..d85ba6cb6820b01226ef8bd40b46bb489041c6a8
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/model_hubs/Skywork-13B-Base-3T/tokenization_skywork.py b/model_hubs/Skywork-13B-Base-3T/tokenization_skywork.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac378d77d2d90d17340b3cb8eaf91bdb1656b71d
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/tokenization_skywork.py
@@ -0,0 +1,250 @@
+# Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
+# This code is built upon Huggingface's transformers repository.
+
+"""Tokenization classes for Skywork."""
+import os
+from shutil import copyfile
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+if TYPE_CHECKING:
+ from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+B_INST, E_INST = "[INST]", "[/INST]"
+B_SYS, E_SYS = "<>\n", "\n<>\n\n"
+
+DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
+that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+class SkyworkTokenizer(PreTrainedTokenizer):
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ unk_token="",
+ bos_token="",
+ eos_token="",
+ pad_token=None,
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
+ add_bos_token=True,
+ add_eos_token=False,
+ clean_up_tokenization_spaces=False,
+ legacy=True,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ self.legacy = legacy
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(vocab_file)
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ add_bos_token=add_bos_token,
+ add_eos_token=add_eos_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ legacy=legacy,
+ **kwargs,
+ )
+ if legacy:
+ logger.warning_once(
+ f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
+ )
+
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+ def tokenize(self, text, **kwargs) -> List[str]:
+ # Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
+ # the beginning of the text
+ if not self.legacy:
+ text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
+ return super().tokenize(text, **kwargs)
+
+ # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+ def _tokenize(self, text):
+ if not self.legacy:
+ is_first = text.startswith(SPIECE_UNDERLINE)
+ if is_first:
+ text = text[1:]
+
+ tokens = self.sp_model.encode(text, out_type=str)
+
+ if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
+ tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
+ return tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.IdToPiece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = bos_token_id + token_ids_0 + eos_token_id
+
+ if token_ids_1 is not None:
+ output = output + bos_token_id + token_ids_1 + eos_token_id
+
+ return output
+
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ bos_token_id = [1] if self.add_bos_token else []
+ eos_token_id = [1] if self.add_eos_token else []
+
+ if token_ids_1 is None:
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+ return (
+ bos_token_id
+ + ([0] * len(token_ids_0))
+ + eos_token_id
+ + bos_token_id
+ + ([0] * len(token_ids_1))
+ + eos_token_id
+ )
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+ if token_ids_1 is not None:
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+ return output
+
+ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+ dialogue = list(conversation.iter_texts())
+ if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
+ [not is_user for is_user, msg in dialogue[1::2]]
+ ):
+ raise ValueError(
+ "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
+ )
+
+ dialog_tokens: List[int] = []
+ if len(conversation.past_user_inputs) > 0:
+ if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
+ conversation.past_user_inputs[0] = (
+ B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+ )
+ elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
+ dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
+
+ dialog_tokens += sum(
+ [
+ [self.bos_token_id]
+ + self.encode(
+ f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
+ )
+ + [self.eos_token_id]
+ for prompt, answer in zip(dialogue[::2], dialogue[1::2])
+ ],
+ [],
+ )
+ if not (dialogue[-1][0]):
+ raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
+ dialog_tokens += [self.bos_token_id] + self.encode(
+ f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
+ )
+ return dialog_tokens
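The class above mirrors the Llama tokenizer: sentencepiece for the base vocabulary, optional BOS/EOS framing in build_inputs_with_special_tokens, and a Llama-2-style chat layout in _build_conversation_input_ids ([INST] ... [/INST] around each turn, with the <<SYS>> system prompt folded into the first user message). A small sketch of the string the final user turn is effectively encoded from; the message text here is made up for illustration:

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

system = "You are a helpful assistant."             # stands in for DEFAULT_SYSTEM_PROMPT
user_msg = "Introduce the Skywork-13B base model."  # hypothetical user turn

# Encoded as the BOS id followed by these tokens (add_special_tokens=False in the real call).
prompt = f"{B_INST} {B_SYS}{system}{E_SYS}{user_msg} {E_INST}"
print(prompt)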
diff --git a/model_hubs/Skywork-13B-Base-3T/tokenizer.model b/model_hubs/Skywork-13B-Base-3T/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..decbfe220922d6a38ff52541ef3927b97fb7893e
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
+size 994250
diff --git a/model_hubs/Skywork-13B-Base-3T/tokenizer_config.json b/model_hubs/Skywork-13B-Base-3T/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c232b8b78a3ad2ce894b9a17628f3821627ccd7
--- /dev/null
+++ b/model_hubs/Skywork-13B-Base-3T/tokenizer_config.json
@@ -0,0 +1,40 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": null,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "SkyworkTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "tokenization_skywork.SkyworkTokenizer",
+ null
+ ]
+ }
+}
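Because tokenizer_class is SkyworkTokenizer and auto_map points at tokenization_skywork.SkyworkTokenizer, the checkpoint directory can be loaded through AutoTokenizer once custom code is allowed. A minimal usage sketch (the local path is an assumption):

from transformers import AutoTokenizer

# trust_remote_code=True lets transformers import tokenization_skywork.py from the checkpoint directory
tok = AutoTokenizer.from_pretrained("model_hubs/Skywork-13B-Base-3T", trust_remote_code=True)

ids = tok("Skywork-13B base model").input_ids
print(ids[:3])         # begins with the BOS id, since add_bos_token is true
print(tok.decode(ids))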
diff --git a/up.sh b/up.sh
new file mode 100644
index 0000000000000000000000000000000000000000..32d4b35c2cbee185a4bb947880b62959f858675a
--- /dev/null
+++ b/up.sh
@@ -0,0 +1,4 @@
+git lfs track model_hubs/*/*.bin
+git add .
+git commit -m "update model and config"
+git push
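One note on up.sh: the glob in "git lfs track model_hubs/*/*.bin" is unquoted, so the shell expands it and LFS writes one .gitattributes rule per matching file, which is why this commit adds hundreds of per-file entries. Quoting the pattern would record a single wildcard rule that also covers shards added later, roughly:

# quoted pattern: one .gitattributes line instead of one per file
git lfs track "model_hubs/*/*.bin"
git add .gitattributes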