breezeyoung committed on
Commit 4f07533 · verified · 1 Parent(s): 1326f8a

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ crl_eval_results/multi_episodes_evolution.png filter=lfs diff=lfs merge=lfs -text
37
+ crl_eval_results/similarity_heatmap.png filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
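
The next diff adds added_tokens.json, which registers the chat/tool control markers and a block of discrete `<|action_token_N|>` entries. As a minimal sketch (assuming the file has been downloaded from this repo to the working directory, and that the contiguous-ID pattern visible in the listed values, id = 151674 + N for N up to 2047, holds for the whole file), the mapping could be checked like this:

```python
import json

# Load the added_tokens.json introduced by this commit.
# Path is assumed to be the file from this repo, saved locally.
with open("added_tokens.json") as f:
    added = json.load(f)

# Control markers added alongside the action vocabulary (IDs as listed in the diff).
for tok in ["<think>", "</think>", "<|action_start|>", "<|action_end|>", "<|action_pad|>"]:
    print(tok, added[tok])

# The discrete action tokens <|action_token_N|> appear to be assigned
# contiguous IDs starting at <|action_token_0|> (151674), i.e. id == base + N.
base = added["<|action_token_0|>"]
assert all(
    added[f"<|action_token_{n}|>"] == base + n
    for n in range(2048)
    if f"<|action_token_{n}|>" in added
)
print("action token IDs are contiguous from", base)
```
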
added_tokens.json ADDED
@@ -0,0 +1,2081 @@
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|action_end|>": 151671,
9
+ "<|action_pad|>": 151670,
10
+ "<|action_start|>": 151669,
11
+ "<|action_token_0|>": 151674,
12
+ "<|action_token_1000|>": 152674,
13
+ "<|action_token_1001|>": 152675,
14
+ "<|action_token_1002|>": 152676,
15
+ "<|action_token_1003|>": 152677,
16
+ "<|action_token_1004|>": 152678,
17
+ "<|action_token_1005|>": 152679,
18
+ "<|action_token_1006|>": 152680,
19
+ "<|action_token_1007|>": 152681,
20
+ "<|action_token_1008|>": 152682,
21
+ "<|action_token_1009|>": 152683,
22
+ "<|action_token_100|>": 151774,
23
+ "<|action_token_1010|>": 152684,
24
+ "<|action_token_1011|>": 152685,
25
+ "<|action_token_1012|>": 152686,
26
+ "<|action_token_1013|>": 152687,
27
+ "<|action_token_1014|>": 152688,
28
+ "<|action_token_1015|>": 152689,
29
+ "<|action_token_1016|>": 152690,
30
+ "<|action_token_1017|>": 152691,
31
+ "<|action_token_1018|>": 152692,
32
+ "<|action_token_1019|>": 152693,
33
+ "<|action_token_101|>": 151775,
34
+ "<|action_token_1020|>": 152694,
35
+ "<|action_token_1021|>": 152695,
36
+ "<|action_token_1022|>": 152696,
37
+ "<|action_token_1023|>": 152697,
38
+ "<|action_token_1024|>": 152698,
39
+ "<|action_token_1025|>": 152699,
40
+ "<|action_token_1026|>": 152700,
41
+ "<|action_token_1027|>": 152701,
42
+ "<|action_token_1028|>": 152702,
43
+ "<|action_token_1029|>": 152703,
44
+ "<|action_token_102|>": 151776,
45
+ "<|action_token_1030|>": 152704,
46
+ "<|action_token_1031|>": 152705,
47
+ "<|action_token_1032|>": 152706,
48
+ "<|action_token_1033|>": 152707,
49
+ "<|action_token_1034|>": 152708,
50
+ "<|action_token_1035|>": 152709,
51
+ "<|action_token_1036|>": 152710,
52
+ "<|action_token_1037|>": 152711,
53
+ "<|action_token_1038|>": 152712,
54
+ "<|action_token_1039|>": 152713,
55
+ "<|action_token_103|>": 151777,
56
+ "<|action_token_1040|>": 152714,
57
+ "<|action_token_1041|>": 152715,
58
+ "<|action_token_1042|>": 152716,
59
+ "<|action_token_1043|>": 152717,
60
+ "<|action_token_1044|>": 152718,
61
+ "<|action_token_1045|>": 152719,
62
+ "<|action_token_1046|>": 152720,
63
+ "<|action_token_1047|>": 152721,
64
+ "<|action_token_1048|>": 152722,
65
+ "<|action_token_1049|>": 152723,
66
+ "<|action_token_104|>": 151778,
67
+ "<|action_token_1050|>": 152724,
68
+ "<|action_token_1051|>": 152725,
69
+ "<|action_token_1052|>": 152726,
70
+ "<|action_token_1053|>": 152727,
71
+ "<|action_token_1054|>": 152728,
72
+ "<|action_token_1055|>": 152729,
73
+ "<|action_token_1056|>": 152730,
74
+ "<|action_token_1057|>": 152731,
75
+ "<|action_token_1058|>": 152732,
76
+ "<|action_token_1059|>": 152733,
77
+ "<|action_token_105|>": 151779,
78
+ "<|action_token_1060|>": 152734,
79
+ "<|action_token_1061|>": 152735,
80
+ "<|action_token_1062|>": 152736,
81
+ "<|action_token_1063|>": 152737,
82
+ "<|action_token_1064|>": 152738,
83
+ "<|action_token_1065|>": 152739,
84
+ "<|action_token_1066|>": 152740,
85
+ "<|action_token_1067|>": 152741,
86
+ "<|action_token_1068|>": 152742,
87
+ "<|action_token_1069|>": 152743,
88
+ "<|action_token_106|>": 151780,
89
+ "<|action_token_1070|>": 152744,
90
+ "<|action_token_1071|>": 152745,
91
+ "<|action_token_1072|>": 152746,
92
+ "<|action_token_1073|>": 152747,
93
+ "<|action_token_1074|>": 152748,
94
+ "<|action_token_1075|>": 152749,
95
+ "<|action_token_1076|>": 152750,
96
+ "<|action_token_1077|>": 152751,
97
+ "<|action_token_1078|>": 152752,
98
+ "<|action_token_1079|>": 152753,
99
+ "<|action_token_107|>": 151781,
100
+ "<|action_token_1080|>": 152754,
101
+ "<|action_token_1081|>": 152755,
102
+ "<|action_token_1082|>": 152756,
103
+ "<|action_token_1083|>": 152757,
104
+ "<|action_token_1084|>": 152758,
105
+ "<|action_token_1085|>": 152759,
106
+ "<|action_token_1086|>": 152760,
107
+ "<|action_token_1087|>": 152761,
108
+ "<|action_token_1088|>": 152762,
109
+ "<|action_token_1089|>": 152763,
110
+ "<|action_token_108|>": 151782,
111
+ "<|action_token_1090|>": 152764,
112
+ "<|action_token_1091|>": 152765,
113
+ "<|action_token_1092|>": 152766,
114
+ "<|action_token_1093|>": 152767,
115
+ "<|action_token_1094|>": 152768,
116
+ "<|action_token_1095|>": 152769,
117
+ "<|action_token_1096|>": 152770,
118
+ "<|action_token_1097|>": 152771,
119
+ "<|action_token_1098|>": 152772,
120
+ "<|action_token_1099|>": 152773,
121
+ "<|action_token_109|>": 151783,
122
+ "<|action_token_10|>": 151684,
123
+ "<|action_token_1100|>": 152774,
124
+ "<|action_token_1101|>": 152775,
125
+ "<|action_token_1102|>": 152776,
126
+ "<|action_token_1103|>": 152777,
127
+ "<|action_token_1104|>": 152778,
128
+ "<|action_token_1105|>": 152779,
129
+ "<|action_token_1106|>": 152780,
130
+ "<|action_token_1107|>": 152781,
131
+ "<|action_token_1108|>": 152782,
132
+ "<|action_token_1109|>": 152783,
133
+ "<|action_token_110|>": 151784,
134
+ "<|action_token_1110|>": 152784,
135
+ "<|action_token_1111|>": 152785,
136
+ "<|action_token_1112|>": 152786,
137
+ "<|action_token_1113|>": 152787,
138
+ "<|action_token_1114|>": 152788,
139
+ "<|action_token_1115|>": 152789,
140
+ "<|action_token_1116|>": 152790,
141
+ "<|action_token_1117|>": 152791,
142
+ "<|action_token_1118|>": 152792,
143
+ "<|action_token_1119|>": 152793,
144
+ "<|action_token_111|>": 151785,
145
+ "<|action_token_1120|>": 152794,
146
+ "<|action_token_1121|>": 152795,
147
+ "<|action_token_1122|>": 152796,
148
+ "<|action_token_1123|>": 152797,
149
+ "<|action_token_1124|>": 152798,
150
+ "<|action_token_1125|>": 152799,
151
+ "<|action_token_1126|>": 152800,
152
+ "<|action_token_1127|>": 152801,
153
+ "<|action_token_1128|>": 152802,
154
+ "<|action_token_1129|>": 152803,
155
+ "<|action_token_112|>": 151786,
156
+ "<|action_token_1130|>": 152804,
157
+ "<|action_token_1131|>": 152805,
158
+ "<|action_token_1132|>": 152806,
159
+ "<|action_token_1133|>": 152807,
160
+ "<|action_token_1134|>": 152808,
161
+ "<|action_token_1135|>": 152809,
162
+ "<|action_token_1136|>": 152810,
163
+ "<|action_token_1137|>": 152811,
164
+ "<|action_token_1138|>": 152812,
165
+ "<|action_token_1139|>": 152813,
166
+ "<|action_token_113|>": 151787,
167
+ "<|action_token_1140|>": 152814,
168
+ "<|action_token_1141|>": 152815,
169
+ "<|action_token_1142|>": 152816,
170
+ "<|action_token_1143|>": 152817,
171
+ "<|action_token_1144|>": 152818,
172
+ "<|action_token_1145|>": 152819,
173
+ "<|action_token_1146|>": 152820,
174
+ "<|action_token_1147|>": 152821,
175
+ "<|action_token_1148|>": 152822,
176
+ "<|action_token_1149|>": 152823,
177
+ "<|action_token_114|>": 151788,
178
+ "<|action_token_1150|>": 152824,
179
+ "<|action_token_1151|>": 152825,
180
+ "<|action_token_1152|>": 152826,
181
+ "<|action_token_1153|>": 152827,
182
+ "<|action_token_1154|>": 152828,
183
+ "<|action_token_1155|>": 152829,
184
+ "<|action_token_1156|>": 152830,
185
+ "<|action_token_1157|>": 152831,
186
+ "<|action_token_1158|>": 152832,
187
+ "<|action_token_1159|>": 152833,
188
+ "<|action_token_115|>": 151789,
189
+ "<|action_token_1160|>": 152834,
190
+ "<|action_token_1161|>": 152835,
191
+ "<|action_token_1162|>": 152836,
192
+ "<|action_token_1163|>": 152837,
193
+ "<|action_token_1164|>": 152838,
194
+ "<|action_token_1165|>": 152839,
195
+ "<|action_token_1166|>": 152840,
196
+ "<|action_token_1167|>": 152841,
197
+ "<|action_token_1168|>": 152842,
198
+ "<|action_token_1169|>": 152843,
199
+ "<|action_token_116|>": 151790,
200
+ "<|action_token_1170|>": 152844,
201
+ "<|action_token_1171|>": 152845,
202
+ "<|action_token_1172|>": 152846,
203
+ "<|action_token_1173|>": 152847,
204
+ "<|action_token_1174|>": 152848,
205
+ "<|action_token_1175|>": 152849,
206
+ "<|action_token_1176|>": 152850,
207
+ "<|action_token_1177|>": 152851,
208
+ "<|action_token_1178|>": 152852,
209
+ "<|action_token_1179|>": 152853,
210
+ "<|action_token_117|>": 151791,
211
+ "<|action_token_1180|>": 152854,
212
+ "<|action_token_1181|>": 152855,
213
+ "<|action_token_1182|>": 152856,
214
+ "<|action_token_1183|>": 152857,
215
+ "<|action_token_1184|>": 152858,
216
+ "<|action_token_1185|>": 152859,
217
+ "<|action_token_1186|>": 152860,
218
+ "<|action_token_1187|>": 152861,
219
+ "<|action_token_1188|>": 152862,
220
+ "<|action_token_1189|>": 152863,
221
+ "<|action_token_118|>": 151792,
222
+ "<|action_token_1190|>": 152864,
223
+ "<|action_token_1191|>": 152865,
224
+ "<|action_token_1192|>": 152866,
225
+ "<|action_token_1193|>": 152867,
226
+ "<|action_token_1194|>": 152868,
227
+ "<|action_token_1195|>": 152869,
228
+ "<|action_token_1196|>": 152870,
229
+ "<|action_token_1197|>": 152871,
230
+ "<|action_token_1198|>": 152872,
231
+ "<|action_token_1199|>": 152873,
232
+ "<|action_token_119|>": 151793,
233
+ "<|action_token_11|>": 151685,
234
+ "<|action_token_1200|>": 152874,
235
+ "<|action_token_1201|>": 152875,
236
+ "<|action_token_1202|>": 152876,
237
+ "<|action_token_1203|>": 152877,
238
+ "<|action_token_1204|>": 152878,
239
+ "<|action_token_1205|>": 152879,
240
+ "<|action_token_1206|>": 152880,
241
+ "<|action_token_1207|>": 152881,
242
+ "<|action_token_1208|>": 152882,
243
+ "<|action_token_1209|>": 152883,
244
+ "<|action_token_120|>": 151794,
245
+ "<|action_token_1210|>": 152884,
246
+ "<|action_token_1211|>": 152885,
247
+ "<|action_token_1212|>": 152886,
248
+ "<|action_token_1213|>": 152887,
249
+ "<|action_token_1214|>": 152888,
250
+ "<|action_token_1215|>": 152889,
251
+ "<|action_token_1216|>": 152890,
252
+ "<|action_token_1217|>": 152891,
253
+ "<|action_token_1218|>": 152892,
254
+ "<|action_token_1219|>": 152893,
255
+ "<|action_token_121|>": 151795,
256
+ "<|action_token_1220|>": 152894,
257
+ "<|action_token_1221|>": 152895,
258
+ "<|action_token_1222|>": 152896,
259
+ "<|action_token_1223|>": 152897,
260
+ "<|action_token_1224|>": 152898,
261
+ "<|action_token_1225|>": 152899,
262
+ "<|action_token_1226|>": 152900,
263
+ "<|action_token_1227|>": 152901,
264
+ "<|action_token_1228|>": 152902,
265
+ "<|action_token_1229|>": 152903,
266
+ "<|action_token_122|>": 151796,
267
+ "<|action_token_1230|>": 152904,
268
+ "<|action_token_1231|>": 152905,
269
+ "<|action_token_1232|>": 152906,
270
+ "<|action_token_1233|>": 152907,
271
+ "<|action_token_1234|>": 152908,
272
+ "<|action_token_1235|>": 152909,
273
+ "<|action_token_1236|>": 152910,
274
+ "<|action_token_1237|>": 152911,
275
+ "<|action_token_1238|>": 152912,
276
+ "<|action_token_1239|>": 152913,
277
+ "<|action_token_123|>": 151797,
278
+ "<|action_token_1240|>": 152914,
279
+ "<|action_token_1241|>": 152915,
280
+ "<|action_token_1242|>": 152916,
281
+ "<|action_token_1243|>": 152917,
282
+ "<|action_token_1244|>": 152918,
283
+ "<|action_token_1245|>": 152919,
284
+ "<|action_token_1246|>": 152920,
285
+ "<|action_token_1247|>": 152921,
286
+ "<|action_token_1248|>": 152922,
287
+ "<|action_token_1249|>": 152923,
288
+ "<|action_token_124|>": 151798,
289
+ "<|action_token_1250|>": 152924,
290
+ "<|action_token_1251|>": 152925,
291
+ "<|action_token_1252|>": 152926,
292
+ "<|action_token_1253|>": 152927,
293
+ "<|action_token_1254|>": 152928,
294
+ "<|action_token_1255|>": 152929,
295
+ "<|action_token_1256|>": 152930,
296
+ "<|action_token_1257|>": 152931,
297
+ "<|action_token_1258|>": 152932,
298
+ "<|action_token_1259|>": 152933,
299
+ "<|action_token_125|>": 151799,
300
+ "<|action_token_1260|>": 152934,
301
+ "<|action_token_1261|>": 152935,
302
+ "<|action_token_1262|>": 152936,
303
+ "<|action_token_1263|>": 152937,
304
+ "<|action_token_1264|>": 152938,
305
+ "<|action_token_1265|>": 152939,
306
+ "<|action_token_1266|>": 152940,
307
+ "<|action_token_1267|>": 152941,
308
+ "<|action_token_1268|>": 152942,
309
+ "<|action_token_1269|>": 152943,
310
+ "<|action_token_126|>": 151800,
311
+ "<|action_token_1270|>": 152944,
312
+ "<|action_token_1271|>": 152945,
313
+ "<|action_token_1272|>": 152946,
314
+ "<|action_token_1273|>": 152947,
315
+ "<|action_token_1274|>": 152948,
316
+ "<|action_token_1275|>": 152949,
317
+ "<|action_token_1276|>": 152950,
318
+ "<|action_token_1277|>": 152951,
319
+ "<|action_token_1278|>": 152952,
320
+ "<|action_token_1279|>": 152953,
321
+ "<|action_token_127|>": 151801,
322
+ "<|action_token_1280|>": 152954,
323
+ "<|action_token_1281|>": 152955,
324
+ "<|action_token_1282|>": 152956,
325
+ "<|action_token_1283|>": 152957,
326
+ "<|action_token_1284|>": 152958,
327
+ "<|action_token_1285|>": 152959,
328
+ "<|action_token_1286|>": 152960,
329
+ "<|action_token_1287|>": 152961,
330
+ "<|action_token_1288|>": 152962,
331
+ "<|action_token_1289|>": 152963,
332
+ "<|action_token_128|>": 151802,
333
+ "<|action_token_1290|>": 152964,
334
+ "<|action_token_1291|>": 152965,
335
+ "<|action_token_1292|>": 152966,
336
+ "<|action_token_1293|>": 152967,
337
+ "<|action_token_1294|>": 152968,
338
+ "<|action_token_1295|>": 152969,
339
+ "<|action_token_1296|>": 152970,
340
+ "<|action_token_1297|>": 152971,
341
+ "<|action_token_1298|>": 152972,
342
+ "<|action_token_1299|>": 152973,
343
+ "<|action_token_129|>": 151803,
344
+ "<|action_token_12|>": 151686,
345
+ "<|action_token_1300|>": 152974,
346
+ "<|action_token_1301|>": 152975,
347
+ "<|action_token_1302|>": 152976,
348
+ "<|action_token_1303|>": 152977,
349
+ "<|action_token_1304|>": 152978,
350
+ "<|action_token_1305|>": 152979,
351
+ "<|action_token_1306|>": 152980,
352
+ "<|action_token_1307|>": 152981,
353
+ "<|action_token_1308|>": 152982,
354
+ "<|action_token_1309|>": 152983,
355
+ "<|action_token_130|>": 151804,
356
+ "<|action_token_1310|>": 152984,
357
+ "<|action_token_1311|>": 152985,
358
+ "<|action_token_1312|>": 152986,
359
+ "<|action_token_1313|>": 152987,
360
+ "<|action_token_1314|>": 152988,
361
+ "<|action_token_1315|>": 152989,
362
+ "<|action_token_1316|>": 152990,
363
+ "<|action_token_1317|>": 152991,
364
+ "<|action_token_1318|>": 152992,
365
+ "<|action_token_1319|>": 152993,
366
+ "<|action_token_131|>": 151805,
367
+ "<|action_token_1320|>": 152994,
368
+ "<|action_token_1321|>": 152995,
369
+ "<|action_token_1322|>": 152996,
370
+ "<|action_token_1323|>": 152997,
371
+ "<|action_token_1324|>": 152998,
372
+ "<|action_token_1325|>": 152999,
373
+ "<|action_token_1326|>": 153000,
374
+ "<|action_token_1327|>": 153001,
375
+ "<|action_token_1328|>": 153002,
376
+ "<|action_token_1329|>": 153003,
377
+ "<|action_token_132|>": 151806,
378
+ "<|action_token_1330|>": 153004,
379
+ "<|action_token_1331|>": 153005,
380
+ "<|action_token_1332|>": 153006,
381
+ "<|action_token_1333|>": 153007,
382
+ "<|action_token_1334|>": 153008,
383
+ "<|action_token_1335|>": 153009,
384
+ "<|action_token_1336|>": 153010,
385
+ "<|action_token_1337|>": 153011,
386
+ "<|action_token_1338|>": 153012,
387
+ "<|action_token_1339|>": 153013,
388
+ "<|action_token_133|>": 151807,
389
+ "<|action_token_1340|>": 153014,
390
+ "<|action_token_1341|>": 153015,
391
+ "<|action_token_1342|>": 153016,
392
+ "<|action_token_1343|>": 153017,
393
+ "<|action_token_1344|>": 153018,
394
+ "<|action_token_1345|>": 153019,
395
+ "<|action_token_1346|>": 153020,
396
+ "<|action_token_1347|>": 153021,
397
+ "<|action_token_1348|>": 153022,
398
+ "<|action_token_1349|>": 153023,
399
+ "<|action_token_134|>": 151808,
400
+ "<|action_token_1350|>": 153024,
401
+ "<|action_token_1351|>": 153025,
402
+ "<|action_token_1352|>": 153026,
403
+ "<|action_token_1353|>": 153027,
404
+ "<|action_token_1354|>": 153028,
405
+ "<|action_token_1355|>": 153029,
406
+ "<|action_token_1356|>": 153030,
407
+ "<|action_token_1357|>": 153031,
408
+ "<|action_token_1358|>": 153032,
409
+ "<|action_token_1359|>": 153033,
410
+ "<|action_token_135|>": 151809,
411
+ "<|action_token_1360|>": 153034,
412
+ "<|action_token_1361|>": 153035,
413
+ "<|action_token_1362|>": 153036,
414
+ "<|action_token_1363|>": 153037,
415
+ "<|action_token_1364|>": 153038,
416
+ "<|action_token_1365|>": 153039,
417
+ "<|action_token_1366|>": 153040,
418
+ "<|action_token_1367|>": 153041,
419
+ "<|action_token_1368|>": 153042,
420
+ "<|action_token_1369|>": 153043,
421
+ "<|action_token_136|>": 151810,
422
+ "<|action_token_1370|>": 153044,
423
+ "<|action_token_1371|>": 153045,
424
+ "<|action_token_1372|>": 153046,
425
+ "<|action_token_1373|>": 153047,
426
+ "<|action_token_1374|>": 153048,
427
+ "<|action_token_1375|>": 153049,
428
+ "<|action_token_1376|>": 153050,
429
+ "<|action_token_1377|>": 153051,
430
+ "<|action_token_1378|>": 153052,
431
+ "<|action_token_1379|>": 153053,
432
+ "<|action_token_137|>": 151811,
433
+ "<|action_token_1380|>": 153054,
434
+ "<|action_token_1381|>": 153055,
435
+ "<|action_token_1382|>": 153056,
436
+ "<|action_token_1383|>": 153057,
437
+ "<|action_token_1384|>": 153058,
438
+ "<|action_token_1385|>": 153059,
439
+ "<|action_token_1386|>": 153060,
440
+ "<|action_token_1387|>": 153061,
441
+ "<|action_token_1388|>": 153062,
442
+ "<|action_token_1389|>": 153063,
443
+ "<|action_token_138|>": 151812,
444
+ "<|action_token_1390|>": 153064,
445
+ "<|action_token_1391|>": 153065,
446
+ "<|action_token_1392|>": 153066,
447
+ "<|action_token_1393|>": 153067,
448
+ "<|action_token_1394|>": 153068,
449
+ "<|action_token_1395|>": 153069,
450
+ "<|action_token_1396|>": 153070,
451
+ "<|action_token_1397|>": 153071,
452
+ "<|action_token_1398|>": 153072,
453
+ "<|action_token_1399|>": 153073,
454
+ "<|action_token_139|>": 151813,
455
+ "<|action_token_13|>": 151687,
456
+ "<|action_token_1400|>": 153074,
457
+ "<|action_token_1401|>": 153075,
458
+ "<|action_token_1402|>": 153076,
459
+ "<|action_token_1403|>": 153077,
460
+ "<|action_token_1404|>": 153078,
461
+ "<|action_token_1405|>": 153079,
462
+ "<|action_token_1406|>": 153080,
463
+ "<|action_token_1407|>": 153081,
464
+ "<|action_token_1408|>": 153082,
465
+ "<|action_token_1409|>": 153083,
466
+ "<|action_token_140|>": 151814,
467
+ "<|action_token_1410|>": 153084,
468
+ "<|action_token_1411|>": 153085,
469
+ "<|action_token_1412|>": 153086,
470
+ "<|action_token_1413|>": 153087,
471
+ "<|action_token_1414|>": 153088,
472
+ "<|action_token_1415|>": 153089,
473
+ "<|action_token_1416|>": 153090,
474
+ "<|action_token_1417|>": 153091,
475
+ "<|action_token_1418|>": 153092,
476
+ "<|action_token_1419|>": 153093,
477
+ "<|action_token_141|>": 151815,
478
+ "<|action_token_1420|>": 153094,
479
+ "<|action_token_1421|>": 153095,
480
+ "<|action_token_1422|>": 153096,
481
+ "<|action_token_1423|>": 153097,
482
+ "<|action_token_1424|>": 153098,
483
+ "<|action_token_1425|>": 153099,
484
+ "<|action_token_1426|>": 153100,
485
+ "<|action_token_1427|>": 153101,
486
+ "<|action_token_1428|>": 153102,
487
+ "<|action_token_1429|>": 153103,
488
+ "<|action_token_142|>": 151816,
489
+ "<|action_token_1430|>": 153104,
490
+ "<|action_token_1431|>": 153105,
491
+ "<|action_token_1432|>": 153106,
492
+ "<|action_token_1433|>": 153107,
493
+ "<|action_token_1434|>": 153108,
494
+ "<|action_token_1435|>": 153109,
495
+ "<|action_token_1436|>": 153110,
496
+ "<|action_token_1437|>": 153111,
497
+ "<|action_token_1438|>": 153112,
498
+ "<|action_token_1439|>": 153113,
499
+ "<|action_token_143|>": 151817,
500
+ "<|action_token_1440|>": 153114,
501
+ "<|action_token_1441|>": 153115,
502
+ "<|action_token_1442|>": 153116,
503
+ "<|action_token_1443|>": 153117,
504
+ "<|action_token_1444|>": 153118,
505
+ "<|action_token_1445|>": 153119,
506
+ "<|action_token_1446|>": 153120,
507
+ "<|action_token_1447|>": 153121,
508
+ "<|action_token_1448|>": 153122,
509
+ "<|action_token_1449|>": 153123,
510
+ "<|action_token_144|>": 151818,
511
+ "<|action_token_1450|>": 153124,
512
+ "<|action_token_1451|>": 153125,
513
+ "<|action_token_1452|>": 153126,
514
+ "<|action_token_1453|>": 153127,
515
+ "<|action_token_1454|>": 153128,
516
+ "<|action_token_1455|>": 153129,
517
+ "<|action_token_1456|>": 153130,
518
+ "<|action_token_1457|>": 153131,
519
+ "<|action_token_1458|>": 153132,
520
+ "<|action_token_1459|>": 153133,
521
+ "<|action_token_145|>": 151819,
522
+ "<|action_token_1460|>": 153134,
523
+ "<|action_token_1461|>": 153135,
524
+ "<|action_token_1462|>": 153136,
525
+ "<|action_token_1463|>": 153137,
526
+ "<|action_token_1464|>": 153138,
527
+ "<|action_token_1465|>": 153139,
528
+ "<|action_token_1466|>": 153140,
529
+ "<|action_token_1467|>": 153141,
530
+ "<|action_token_1468|>": 153142,
531
+ "<|action_token_1469|>": 153143,
532
+ "<|action_token_146|>": 151820,
533
+ "<|action_token_1470|>": 153144,
534
+ "<|action_token_1471|>": 153145,
535
+ "<|action_token_1472|>": 153146,
536
+ "<|action_token_1473|>": 153147,
537
+ "<|action_token_1474|>": 153148,
538
+ "<|action_token_1475|>": 153149,
539
+ "<|action_token_1476|>": 153150,
540
+ "<|action_token_1477|>": 153151,
541
+ "<|action_token_1478|>": 153152,
542
+ "<|action_token_1479|>": 153153,
543
+ "<|action_token_147|>": 151821,
544
+ "<|action_token_1480|>": 153154,
545
+ "<|action_token_1481|>": 153155,
546
+ "<|action_token_1482|>": 153156,
547
+ "<|action_token_1483|>": 153157,
548
+ "<|action_token_1484|>": 153158,
549
+ "<|action_token_1485|>": 153159,
550
+ "<|action_token_1486|>": 153160,
551
+ "<|action_token_1487|>": 153161,
552
+ "<|action_token_1488|>": 153162,
553
+ "<|action_token_1489|>": 153163,
554
+ "<|action_token_148|>": 151822,
555
+ "<|action_token_1490|>": 153164,
556
+ "<|action_token_1491|>": 153165,
557
+ "<|action_token_1492|>": 153166,
558
+ "<|action_token_1493|>": 153167,
559
+ "<|action_token_1494|>": 153168,
560
+ "<|action_token_1495|>": 153169,
561
+ "<|action_token_1496|>": 153170,
562
+ "<|action_token_1497|>": 153171,
563
+ "<|action_token_1498|>": 153172,
564
+ "<|action_token_1499|>": 153173,
565
+ "<|action_token_149|>": 151823,
566
+ "<|action_token_14|>": 151688,
567
+ "<|action_token_1500|>": 153174,
568
+ "<|action_token_1501|>": 153175,
569
+ "<|action_token_1502|>": 153176,
570
+ "<|action_token_1503|>": 153177,
571
+ "<|action_token_1504|>": 153178,
572
+ "<|action_token_1505|>": 153179,
573
+ "<|action_token_1506|>": 153180,
574
+ "<|action_token_1507|>": 153181,
575
+ "<|action_token_1508|>": 153182,
576
+ "<|action_token_1509|>": 153183,
577
+ "<|action_token_150|>": 151824,
578
+ "<|action_token_1510|>": 153184,
579
+ "<|action_token_1511|>": 153185,
580
+ "<|action_token_1512|>": 153186,
581
+ "<|action_token_1513|>": 153187,
582
+ "<|action_token_1514|>": 153188,
583
+ "<|action_token_1515|>": 153189,
584
+ "<|action_token_1516|>": 153190,
585
+ "<|action_token_1517|>": 153191,
586
+ "<|action_token_1518|>": 153192,
587
+ "<|action_token_1519|>": 153193,
588
+ "<|action_token_151|>": 151825,
589
+ "<|action_token_1520|>": 153194,
590
+ "<|action_token_1521|>": 153195,
591
+ "<|action_token_1522|>": 153196,
592
+ "<|action_token_1523|>": 153197,
593
+ "<|action_token_1524|>": 153198,
594
+ "<|action_token_1525|>": 153199,
595
+ "<|action_token_1526|>": 153200,
596
+ "<|action_token_1527|>": 153201,
597
+ "<|action_token_1528|>": 153202,
598
+ "<|action_token_1529|>": 153203,
599
+ "<|action_token_152|>": 151826,
600
+ "<|action_token_1530|>": 153204,
601
+ "<|action_token_1531|>": 153205,
602
+ "<|action_token_1532|>": 153206,
603
+ "<|action_token_1533|>": 153207,
604
+ "<|action_token_1534|>": 153208,
605
+ "<|action_token_1535|>": 153209,
606
+ "<|action_token_1536|>": 153210,
607
+ "<|action_token_1537|>": 153211,
608
+ "<|action_token_1538|>": 153212,
609
+ "<|action_token_1539|>": 153213,
610
+ "<|action_token_153|>": 151827,
611
+ "<|action_token_1540|>": 153214,
612
+ "<|action_token_1541|>": 153215,
613
+ "<|action_token_1542|>": 153216,
614
+ "<|action_token_1543|>": 153217,
615
+ "<|action_token_1544|>": 153218,
616
+ "<|action_token_1545|>": 153219,
617
+ "<|action_token_1546|>": 153220,
618
+ "<|action_token_1547|>": 153221,
619
+ "<|action_token_1548|>": 153222,
620
+ "<|action_token_1549|>": 153223,
621
+ "<|action_token_154|>": 151828,
622
+ "<|action_token_1550|>": 153224,
623
+ "<|action_token_1551|>": 153225,
624
+ "<|action_token_1552|>": 153226,
625
+ "<|action_token_1553|>": 153227,
626
+ "<|action_token_1554|>": 153228,
627
+ "<|action_token_1555|>": 153229,
628
+ "<|action_token_1556|>": 153230,
629
+ "<|action_token_1557|>": 153231,
630
+ "<|action_token_1558|>": 153232,
631
+ "<|action_token_1559|>": 153233,
632
+ "<|action_token_155|>": 151829,
633
+ "<|action_token_1560|>": 153234,
634
+ "<|action_token_1561|>": 153235,
635
+ "<|action_token_1562|>": 153236,
636
+ "<|action_token_1563|>": 153237,
637
+ "<|action_token_1564|>": 153238,
638
+ "<|action_token_1565|>": 153239,
639
+ "<|action_token_1566|>": 153240,
640
+ "<|action_token_1567|>": 153241,
641
+ "<|action_token_1568|>": 153242,
642
+ "<|action_token_1569|>": 153243,
643
+ "<|action_token_156|>": 151830,
644
+ "<|action_token_1570|>": 153244,
645
+ "<|action_token_1571|>": 153245,
646
+ "<|action_token_1572|>": 153246,
647
+ "<|action_token_1573|>": 153247,
648
+ "<|action_token_1574|>": 153248,
649
+ "<|action_token_1575|>": 153249,
650
+ "<|action_token_1576|>": 153250,
651
+ "<|action_token_1577|>": 153251,
652
+ "<|action_token_1578|>": 153252,
653
+ "<|action_token_1579|>": 153253,
654
+ "<|action_token_157|>": 151831,
655
+ "<|action_token_1580|>": 153254,
656
+ "<|action_token_1581|>": 153255,
657
+ "<|action_token_1582|>": 153256,
658
+ "<|action_token_1583|>": 153257,
659
+ "<|action_token_1584|>": 153258,
660
+ "<|action_token_1585|>": 153259,
661
+ "<|action_token_1586|>": 153260,
662
+ "<|action_token_1587|>": 153261,
663
+ "<|action_token_1588|>": 153262,
664
+ "<|action_token_1589|>": 153263,
665
+ "<|action_token_158|>": 151832,
666
+ "<|action_token_1590|>": 153264,
667
+ "<|action_token_1591|>": 153265,
668
+ "<|action_token_1592|>": 153266,
669
+ "<|action_token_1593|>": 153267,
670
+ "<|action_token_1594|>": 153268,
671
+ "<|action_token_1595|>": 153269,
672
+ "<|action_token_1596|>": 153270,
673
+ "<|action_token_1597|>": 153271,
674
+ "<|action_token_1598|>": 153272,
675
+ "<|action_token_1599|>": 153273,
676
+ "<|action_token_159|>": 151833,
677
+ "<|action_token_15|>": 151689,
678
+ "<|action_token_1600|>": 153274,
679
+ "<|action_token_1601|>": 153275,
680
+ "<|action_token_1602|>": 153276,
681
+ "<|action_token_1603|>": 153277,
682
+ "<|action_token_1604|>": 153278,
683
+ "<|action_token_1605|>": 153279,
684
+ "<|action_token_1606|>": 153280,
685
+ "<|action_token_1607|>": 153281,
686
+ "<|action_token_1608|>": 153282,
687
+ "<|action_token_1609|>": 153283,
688
+ "<|action_token_160|>": 151834,
689
+ "<|action_token_1610|>": 153284,
690
+ "<|action_token_1611|>": 153285,
691
+ "<|action_token_1612|>": 153286,
692
+ "<|action_token_1613|>": 153287,
693
+ "<|action_token_1614|>": 153288,
694
+ "<|action_token_1615|>": 153289,
695
+ "<|action_token_1616|>": 153290,
696
+ "<|action_token_1617|>": 153291,
697
+ "<|action_token_1618|>": 153292,
698
+ "<|action_token_1619|>": 153293,
699
+ "<|action_token_161|>": 151835,
700
+ "<|action_token_1620|>": 153294,
701
+ "<|action_token_1621|>": 153295,
702
+ "<|action_token_1622|>": 153296,
703
+ "<|action_token_1623|>": 153297,
704
+ "<|action_token_1624|>": 153298,
705
+ "<|action_token_1625|>": 153299,
706
+ "<|action_token_1626|>": 153300,
707
+ "<|action_token_1627|>": 153301,
708
+ "<|action_token_1628|>": 153302,
709
+ "<|action_token_1629|>": 153303,
710
+ "<|action_token_162|>": 151836,
711
+ "<|action_token_1630|>": 153304,
712
+ "<|action_token_1631|>": 153305,
713
+ "<|action_token_1632|>": 153306,
714
+ "<|action_token_1633|>": 153307,
715
+ "<|action_token_1634|>": 153308,
716
+ "<|action_token_1635|>": 153309,
717
+ "<|action_token_1636|>": 153310,
718
+ "<|action_token_1637|>": 153311,
719
+ "<|action_token_1638|>": 153312,
720
+ "<|action_token_1639|>": 153313,
721
+ "<|action_token_163|>": 151837,
722
+ "<|action_token_1640|>": 153314,
723
+ "<|action_token_1641|>": 153315,
724
+ "<|action_token_1642|>": 153316,
725
+ "<|action_token_1643|>": 153317,
726
+ "<|action_token_1644|>": 153318,
727
+ "<|action_token_1645|>": 153319,
728
+ "<|action_token_1646|>": 153320,
729
+ "<|action_token_1647|>": 153321,
730
+ "<|action_token_1648|>": 153322,
731
+ "<|action_token_1649|>": 153323,
732
+ "<|action_token_164|>": 151838,
733
+ "<|action_token_1650|>": 153324,
734
+ "<|action_token_1651|>": 153325,
735
+ "<|action_token_1652|>": 153326,
736
+ "<|action_token_1653|>": 153327,
737
+ "<|action_token_1654|>": 153328,
738
+ "<|action_token_1655|>": 153329,
739
+ "<|action_token_1656|>": 153330,
740
+ "<|action_token_1657|>": 153331,
741
+ "<|action_token_1658|>": 153332,
742
+ "<|action_token_1659|>": 153333,
743
+ "<|action_token_165|>": 151839,
744
+ "<|action_token_1660|>": 153334,
745
+ "<|action_token_1661|>": 153335,
746
+ "<|action_token_1662|>": 153336,
747
+ "<|action_token_1663|>": 153337,
748
+ "<|action_token_1664|>": 153338,
749
+ "<|action_token_1665|>": 153339,
750
+ "<|action_token_1666|>": 153340,
751
+ "<|action_token_1667|>": 153341,
752
+ "<|action_token_1668|>": 153342,
753
+ "<|action_token_1669|>": 153343,
754
+ "<|action_token_166|>": 151840,
755
+ "<|action_token_1670|>": 153344,
756
+ "<|action_token_1671|>": 153345,
757
+ "<|action_token_1672|>": 153346,
758
+ "<|action_token_1673|>": 153347,
759
+ "<|action_token_1674|>": 153348,
760
+ "<|action_token_1675|>": 153349,
761
+ "<|action_token_1676|>": 153350,
762
+ "<|action_token_1677|>": 153351,
763
+ "<|action_token_1678|>": 153352,
764
+ "<|action_token_1679|>": 153353,
765
+ "<|action_token_167|>": 151841,
766
+ "<|action_token_1680|>": 153354,
767
+ "<|action_token_1681|>": 153355,
768
+ "<|action_token_1682|>": 153356,
769
+ "<|action_token_1683|>": 153357,
770
+ "<|action_token_1684|>": 153358,
771
+ "<|action_token_1685|>": 153359,
772
+ "<|action_token_1686|>": 153360,
773
+ "<|action_token_1687|>": 153361,
774
+ "<|action_token_1688|>": 153362,
775
+ "<|action_token_1689|>": 153363,
776
+ "<|action_token_168|>": 151842,
777
+ "<|action_token_1690|>": 153364,
778
+ "<|action_token_1691|>": 153365,
779
+ "<|action_token_1692|>": 153366,
780
+ "<|action_token_1693|>": 153367,
781
+ "<|action_token_1694|>": 153368,
782
+ "<|action_token_1695|>": 153369,
783
+ "<|action_token_1696|>": 153370,
784
+ "<|action_token_1697|>": 153371,
785
+ "<|action_token_1698|>": 153372,
786
+ "<|action_token_1699|>": 153373,
787
+ "<|action_token_169|>": 151843,
788
+ "<|action_token_16|>": 151690,
789
+ "<|action_token_1700|>": 153374,
790
+ "<|action_token_1701|>": 153375,
791
+ "<|action_token_1702|>": 153376,
792
+ "<|action_token_1703|>": 153377,
793
+ "<|action_token_1704|>": 153378,
794
+ "<|action_token_1705|>": 153379,
795
+ "<|action_token_1706|>": 153380,
796
+ "<|action_token_1707|>": 153381,
797
+ "<|action_token_1708|>": 153382,
798
+ "<|action_token_1709|>": 153383,
799
+ "<|action_token_170|>": 151844,
800
+ "<|action_token_1710|>": 153384,
801
+ "<|action_token_1711|>": 153385,
802
+ "<|action_token_1712|>": 153386,
803
+ "<|action_token_1713|>": 153387,
804
+ "<|action_token_1714|>": 153388,
805
+ "<|action_token_1715|>": 153389,
806
+ "<|action_token_1716|>": 153390,
807
+ "<|action_token_1717|>": 153391,
808
+ "<|action_token_1718|>": 153392,
809
+ "<|action_token_1719|>": 153393,
810
+ "<|action_token_171|>": 151845,
811
+ "<|action_token_1720|>": 153394,
812
+ "<|action_token_1721|>": 153395,
813
+ "<|action_token_1722|>": 153396,
814
+ "<|action_token_1723|>": 153397,
815
+ "<|action_token_1724|>": 153398,
816
+ "<|action_token_1725|>": 153399,
817
+ "<|action_token_1726|>": 153400,
818
+ "<|action_token_1727|>": 153401,
819
+ "<|action_token_1728|>": 153402,
820
+ "<|action_token_1729|>": 153403,
821
+ "<|action_token_172|>": 151846,
822
+ "<|action_token_1730|>": 153404,
823
+ "<|action_token_1731|>": 153405,
824
+ "<|action_token_1732|>": 153406,
825
+ "<|action_token_1733|>": 153407,
826
+ "<|action_token_1734|>": 153408,
827
+ "<|action_token_1735|>": 153409,
828
+ "<|action_token_1736|>": 153410,
829
+ "<|action_token_1737|>": 153411,
830
+ "<|action_token_1738|>": 153412,
831
+ "<|action_token_1739|>": 153413,
832
+ "<|action_token_173|>": 151847,
833
+ "<|action_token_1740|>": 153414,
834
+ "<|action_token_1741|>": 153415,
835
+ "<|action_token_1742|>": 153416,
836
+ "<|action_token_1743|>": 153417,
837
+ "<|action_token_1744|>": 153418,
838
+ "<|action_token_1745|>": 153419,
839
+ "<|action_token_1746|>": 153420,
840
+ "<|action_token_1747|>": 153421,
841
+ "<|action_token_1748|>": 153422,
842
+ "<|action_token_1749|>": 153423,
843
+ "<|action_token_174|>": 151848,
844
+ "<|action_token_1750|>": 153424,
845
+ "<|action_token_1751|>": 153425,
846
+ "<|action_token_1752|>": 153426,
847
+ "<|action_token_1753|>": 153427,
848
+ "<|action_token_1754|>": 153428,
849
+ "<|action_token_1755|>": 153429,
850
+ "<|action_token_1756|>": 153430,
851
+ "<|action_token_1757|>": 153431,
852
+ "<|action_token_1758|>": 153432,
853
+ "<|action_token_1759|>": 153433,
854
+ "<|action_token_175|>": 151849,
855
+ "<|action_token_1760|>": 153434,
856
+ "<|action_token_1761|>": 153435,
857
+ "<|action_token_1762|>": 153436,
858
+ "<|action_token_1763|>": 153437,
859
+ "<|action_token_1764|>": 153438,
860
+ "<|action_token_1765|>": 153439,
861
+ "<|action_token_1766|>": 153440,
862
+ "<|action_token_1767|>": 153441,
863
+ "<|action_token_1768|>": 153442,
864
+ "<|action_token_1769|>": 153443,
865
+ "<|action_token_176|>": 151850,
866
+ "<|action_token_1770|>": 153444,
867
+ "<|action_token_1771|>": 153445,
868
+ "<|action_token_1772|>": 153446,
869
+ "<|action_token_1773|>": 153447,
870
+ "<|action_token_1774|>": 153448,
871
+ "<|action_token_1775|>": 153449,
872
+ "<|action_token_1776|>": 153450,
873
+ "<|action_token_1777|>": 153451,
874
+ "<|action_token_1778|>": 153452,
875
+ "<|action_token_1779|>": 153453,
876
+ "<|action_token_177|>": 151851,
877
+ "<|action_token_1780|>": 153454,
878
+ "<|action_token_1781|>": 153455,
879
+ "<|action_token_1782|>": 153456,
880
+ "<|action_token_1783|>": 153457,
881
+ "<|action_token_1784|>": 153458,
882
+ "<|action_token_1785|>": 153459,
883
+ "<|action_token_1786|>": 153460,
884
+ "<|action_token_1787|>": 153461,
885
+ "<|action_token_1788|>": 153462,
886
+ "<|action_token_1789|>": 153463,
887
+ "<|action_token_178|>": 151852,
888
+ "<|action_token_1790|>": 153464,
889
+ "<|action_token_1791|>": 153465,
890
+ "<|action_token_1792|>": 153466,
891
+ "<|action_token_1793|>": 153467,
892
+ "<|action_token_1794|>": 153468,
893
+ "<|action_token_1795|>": 153469,
894
+ "<|action_token_1796|>": 153470,
895
+ "<|action_token_1797|>": 153471,
896
+ "<|action_token_1798|>": 153472,
897
+ "<|action_token_1799|>": 153473,
898
+ "<|action_token_179|>": 151853,
899
+ "<|action_token_17|>": 151691,
900
+ "<|action_token_1800|>": 153474,
901
+ "<|action_token_1801|>": 153475,
902
+ "<|action_token_1802|>": 153476,
903
+ "<|action_token_1803|>": 153477,
904
+ "<|action_token_1804|>": 153478,
905
+ "<|action_token_1805|>": 153479,
906
+ "<|action_token_1806|>": 153480,
907
+ "<|action_token_1807|>": 153481,
908
+ "<|action_token_1808|>": 153482,
909
+ "<|action_token_1809|>": 153483,
910
+ "<|action_token_180|>": 151854,
911
+ "<|action_token_1810|>": 153484,
912
+ "<|action_token_1811|>": 153485,
913
+ "<|action_token_1812|>": 153486,
914
+ "<|action_token_1813|>": 153487,
915
+ "<|action_token_1814|>": 153488,
916
+ "<|action_token_1815|>": 153489,
917
+ "<|action_token_1816|>": 153490,
918
+ "<|action_token_1817|>": 153491,
919
+ "<|action_token_1818|>": 153492,
920
+ "<|action_token_1819|>": 153493,
921
+ "<|action_token_181|>": 151855,
922
+ "<|action_token_1820|>": 153494,
923
+ "<|action_token_1821|>": 153495,
924
+ "<|action_token_1822|>": 153496,
925
+ "<|action_token_1823|>": 153497,
926
+ "<|action_token_1824|>": 153498,
927
+ "<|action_token_1825|>": 153499,
928
+ "<|action_token_1826|>": 153500,
929
+ "<|action_token_1827|>": 153501,
930
+ "<|action_token_1828|>": 153502,
931
+ "<|action_token_1829|>": 153503,
932
+ "<|action_token_182|>": 151856,
933
+ "<|action_token_1830|>": 153504,
934
+ "<|action_token_1831|>": 153505,
935
+ "<|action_token_1832|>": 153506,
936
+ "<|action_token_1833|>": 153507,
937
+ "<|action_token_1834|>": 153508,
938
+ "<|action_token_1835|>": 153509,
939
+ "<|action_token_1836|>": 153510,
940
+ "<|action_token_1837|>": 153511,
941
+ "<|action_token_1838|>": 153512,
942
+ "<|action_token_1839|>": 153513,
943
+ "<|action_token_183|>": 151857,
944
+ "<|action_token_1840|>": 153514,
945
+ "<|action_token_1841|>": 153515,
946
+ "<|action_token_1842|>": 153516,
947
+ "<|action_token_1843|>": 153517,
948
+ "<|action_token_1844|>": 153518,
949
+ "<|action_token_1845|>": 153519,
950
+ "<|action_token_1846|>": 153520,
951
+ "<|action_token_1847|>": 153521,
952
+ "<|action_token_1848|>": 153522,
953
+ "<|action_token_1849|>": 153523,
954
+ "<|action_token_184|>": 151858,
955
+ "<|action_token_1850|>": 153524,
956
+ "<|action_token_1851|>": 153525,
957
+ "<|action_token_1852|>": 153526,
958
+ "<|action_token_1853|>": 153527,
959
+ "<|action_token_1854|>": 153528,
960
+ "<|action_token_1855|>": 153529,
961
+ "<|action_token_1856|>": 153530,
962
+ "<|action_token_1857|>": 153531,
963
+ "<|action_token_1858|>": 153532,
964
+ "<|action_token_1859|>": 153533,
965
+ "<|action_token_185|>": 151859,
966
+ "<|action_token_1860|>": 153534,
967
+ "<|action_token_1861|>": 153535,
968
+ "<|action_token_1862|>": 153536,
969
+ "<|action_token_1863|>": 153537,
970
+ "<|action_token_1864|>": 153538,
971
+ "<|action_token_1865|>": 153539,
972
+ "<|action_token_1866|>": 153540,
973
+ "<|action_token_1867|>": 153541,
974
+ "<|action_token_1868|>": 153542,
975
+ "<|action_token_1869|>": 153543,
976
+ "<|action_token_186|>": 151860,
977
+ "<|action_token_1870|>": 153544,
978
+ "<|action_token_1871|>": 153545,
979
+ "<|action_token_1872|>": 153546,
980
+ "<|action_token_1873|>": 153547,
981
+ "<|action_token_1874|>": 153548,
982
+ "<|action_token_1875|>": 153549,
983
+ "<|action_token_1876|>": 153550,
984
+ "<|action_token_1877|>": 153551,
985
+ "<|action_token_1878|>": 153552,
986
+ "<|action_token_1879|>": 153553,
987
+ "<|action_token_187|>": 151861,
988
+ "<|action_token_1880|>": 153554,
989
+ "<|action_token_1881|>": 153555,
990
+ "<|action_token_1882|>": 153556,
991
+ "<|action_token_1883|>": 153557,
992
+ "<|action_token_1884|>": 153558,
993
+ "<|action_token_1885|>": 153559,
994
+ "<|action_token_1886|>": 153560,
995
+ "<|action_token_1887|>": 153561,
996
+ "<|action_token_1888|>": 153562,
997
+ "<|action_token_1889|>": 153563,
998
+ "<|action_token_188|>": 151862,
999
+ "<|action_token_1890|>": 153564,
1000
+ "<|action_token_1891|>": 153565,
1001
+ "<|action_token_1892|>": 153566,
1002
+ "<|action_token_1893|>": 153567,
1003
+ "<|action_token_1894|>": 153568,
1004
+ "<|action_token_1895|>": 153569,
1005
+ "<|action_token_1896|>": 153570,
1006
+ "<|action_token_1897|>": 153571,
1007
+ "<|action_token_1898|>": 153572,
1008
+ "<|action_token_1899|>": 153573,
1009
+ "<|action_token_189|>": 151863,
1010
+ "<|action_token_18|>": 151692,
1011
+ "<|action_token_1900|>": 153574,
1012
+ "<|action_token_1901|>": 153575,
1013
+ "<|action_token_1902|>": 153576,
1014
+ "<|action_token_1903|>": 153577,
1015
+ "<|action_token_1904|>": 153578,
1016
+ "<|action_token_1905|>": 153579,
1017
+ "<|action_token_1906|>": 153580,
1018
+ "<|action_token_1907|>": 153581,
1019
+ "<|action_token_1908|>": 153582,
1020
+ "<|action_token_1909|>": 153583,
1021
+ "<|action_token_190|>": 151864,
1022
+ "<|action_token_1910|>": 153584,
1023
+ "<|action_token_1911|>": 153585,
1024
+ "<|action_token_1912|>": 153586,
1025
+ "<|action_token_1913|>": 153587,
1026
+ "<|action_token_1914|>": 153588,
1027
+ "<|action_token_1915|>": 153589,
1028
+ "<|action_token_1916|>": 153590,
1029
+ "<|action_token_1917|>": 153591,
1030
+ "<|action_token_1918|>": 153592,
1031
+ "<|action_token_1919|>": 153593,
1032
+ "<|action_token_191|>": 151865,
1033
+ "<|action_token_1920|>": 153594,
1034
+ "<|action_token_1921|>": 153595,
1035
+ "<|action_token_1922|>": 153596,
1036
+ "<|action_token_1923|>": 153597,
1037
+ "<|action_token_1924|>": 153598,
1038
+ "<|action_token_1925|>": 153599,
1039
+ "<|action_token_1926|>": 153600,
1040
+ "<|action_token_1927|>": 153601,
1041
+ "<|action_token_1928|>": 153602,
1042
+ "<|action_token_1929|>": 153603,
1043
+ "<|action_token_192|>": 151866,
1044
+ "<|action_token_1930|>": 153604,
1045
+ "<|action_token_1931|>": 153605,
1046
+ "<|action_token_1932|>": 153606,
1047
+ "<|action_token_1933|>": 153607,
1048
+ "<|action_token_1934|>": 153608,
1049
+ "<|action_token_1935|>": 153609,
1050
+ "<|action_token_1936|>": 153610,
1051
+ "<|action_token_1937|>": 153611,
1052
+ "<|action_token_1938|>": 153612,
1053
+ "<|action_token_1939|>": 153613,
1054
+ "<|action_token_193|>": 151867,
1055
+ "<|action_token_1940|>": 153614,
1056
+ "<|action_token_1941|>": 153615,
1057
+ "<|action_token_1942|>": 153616,
1058
+ "<|action_token_1943|>": 153617,
1059
+ "<|action_token_1944|>": 153618,
1060
+ "<|action_token_1945|>": 153619,
1061
+ "<|action_token_1946|>": 153620,
1062
+ "<|action_token_1947|>": 153621,
1063
+ "<|action_token_1948|>": 153622,
1064
+ "<|action_token_1949|>": 153623,
1065
+ "<|action_token_194|>": 151868,
1066
+ "<|action_token_1950|>": 153624,
1067
+ "<|action_token_1951|>": 153625,
1068
+ "<|action_token_1952|>": 153626,
1069
+ "<|action_token_1953|>": 153627,
1070
+ "<|action_token_1954|>": 153628,
1071
+ "<|action_token_1955|>": 153629,
1072
+ "<|action_token_1956|>": 153630,
1073
+ "<|action_token_1957|>": 153631,
1074
+ "<|action_token_1958|>": 153632,
1075
+ "<|action_token_1959|>": 153633,
1076
+ "<|action_token_195|>": 151869,
1077
+ "<|action_token_1960|>": 153634,
1078
+ "<|action_token_1961|>": 153635,
1079
+ "<|action_token_1962|>": 153636,
1080
+ "<|action_token_1963|>": 153637,
1081
+ "<|action_token_1964|>": 153638,
1082
+ "<|action_token_1965|>": 153639,
1083
+ "<|action_token_1966|>": 153640,
1084
+ "<|action_token_1967|>": 153641,
1085
+ "<|action_token_1968|>": 153642,
1086
+ "<|action_token_1969|>": 153643,
1087
+ "<|action_token_196|>": 151870,
1088
+ "<|action_token_1970|>": 153644,
1089
+ "<|action_token_1971|>": 153645,
1090
+ "<|action_token_1972|>": 153646,
1091
+ "<|action_token_1973|>": 153647,
1092
+ "<|action_token_1974|>": 153648,
1093
+ "<|action_token_1975|>": 153649,
1094
+ "<|action_token_1976|>": 153650,
1095
+ "<|action_token_1977|>": 153651,
1096
+ "<|action_token_1978|>": 153652,
1097
+ "<|action_token_1979|>": 153653,
1098
+ "<|action_token_197|>": 151871,
1099
+ "<|action_token_1980|>": 153654,
1100
+ "<|action_token_1981|>": 153655,
1101
+ "<|action_token_1982|>": 153656,
1102
+ "<|action_token_1983|>": 153657,
1103
+ "<|action_token_1984|>": 153658,
1104
+ "<|action_token_1985|>": 153659,
1105
+ "<|action_token_1986|>": 153660,
1106
+ "<|action_token_1987|>": 153661,
1107
+ "<|action_token_1988|>": 153662,
1108
+ "<|action_token_1989|>": 153663,
1109
+ "<|action_token_198|>": 151872,
1110
+ "<|action_token_1990|>": 153664,
1111
+ "<|action_token_1991|>": 153665,
1112
+ "<|action_token_1992|>": 153666,
1113
+ "<|action_token_1993|>": 153667,
1114
+ "<|action_token_1994|>": 153668,
1115
+ "<|action_token_1995|>": 153669,
1116
+ "<|action_token_1996|>": 153670,
1117
+ "<|action_token_1997|>": 153671,
1118
+ "<|action_token_1998|>": 153672,
1119
+ "<|action_token_1999|>": 153673,
1120
+ "<|action_token_199|>": 151873,
1121
+ "<|action_token_19|>": 151693,
1122
+ "<|action_token_1|>": 151675,
1123
+ "<|action_token_2000|>": 153674,
1124
+ "<|action_token_2001|>": 153675,
1125
+ "<|action_token_2002|>": 153676,
1126
+ "<|action_token_2003|>": 153677,
1127
+ "<|action_token_2004|>": 153678,
1128
+ "<|action_token_2005|>": 153679,
1129
+ "<|action_token_2006|>": 153680,
1130
+ "<|action_token_2007|>": 153681,
1131
+ "<|action_token_2008|>": 153682,
1132
+ "<|action_token_2009|>": 153683,
1133
+ "<|action_token_200|>": 151874,
1134
+ "<|action_token_2010|>": 153684,
1135
+ "<|action_token_2011|>": 153685,
1136
+ "<|action_token_2012|>": 153686,
1137
+ "<|action_token_2013|>": 153687,
1138
+ "<|action_token_2014|>": 153688,
1139
+ "<|action_token_2015|>": 153689,
1140
+ "<|action_token_2016|>": 153690,
1141
+ "<|action_token_2017|>": 153691,
1142
+ "<|action_token_2018|>": 153692,
1143
+ "<|action_token_2019|>": 153693,
1144
+ "<|action_token_201|>": 151875,
1145
+ "<|action_token_2020|>": 153694,
1146
+ "<|action_token_2021|>": 153695,
1147
+ "<|action_token_2022|>": 153696,
1148
+ "<|action_token_2023|>": 153697,
1149
+ "<|action_token_2024|>": 153698,
1150
+ "<|action_token_2025|>": 153699,
1151
+ "<|action_token_2026|>": 153700,
1152
+ "<|action_token_2027|>": 153701,
1153
+ "<|action_token_2028|>": 153702,
1154
+ "<|action_token_2029|>": 153703,
1155
+ "<|action_token_202|>": 151876,
1156
+ "<|action_token_2030|>": 153704,
1157
+ "<|action_token_2031|>": 153705,
1158
+ "<|action_token_2032|>": 153706,
1159
+ "<|action_token_2033|>": 153707,
1160
+ "<|action_token_2034|>": 153708,
1161
+ "<|action_token_2035|>": 153709,
1162
+ "<|action_token_2036|>": 153710,
1163
+ "<|action_token_2037|>": 153711,
1164
+ "<|action_token_2038|>": 153712,
1165
+ "<|action_token_2039|>": 153713,
1166
+ "<|action_token_203|>": 151877,
1167
+ "<|action_token_2040|>": 153714,
1168
+ "<|action_token_2041|>": 153715,
1169
+ "<|action_token_2042|>": 153716,
1170
+ "<|action_token_2043|>": 153717,
1171
+ "<|action_token_2044|>": 153718,
1172
+ "<|action_token_2045|>": 153719,
1173
+ "<|action_token_2046|>": 153720,
1174
+ "<|action_token_2047|>": 153721,
1175
+ "<|action_token_204|>": 151878,
1176
+ "<|action_token_205|>": 151879,
1177
+ "<|action_token_206|>": 151880,
1178
+ "<|action_token_207|>": 151881,
1179
+ "<|action_token_208|>": 151882,
1180
+ "<|action_token_209|>": 151883,
1181
+ "<|action_token_20|>": 151694,
1182
+ "<|action_token_210|>": 151884,
1183
+ "<|action_token_211|>": 151885,
1184
+ "<|action_token_212|>": 151886,
1185
+ "<|action_token_213|>": 151887,
1186
+ "<|action_token_214|>": 151888,
1187
+ "<|action_token_215|>": 151889,
1188
+ "<|action_token_216|>": 151890,
1189
+ "<|action_token_217|>": 151891,
1190
+ "<|action_token_218|>": 151892,
1191
+ "<|action_token_219|>": 151893,
1192
+ "<|action_token_21|>": 151695,
1193
+ "<|action_token_220|>": 151894,
1194
+ "<|action_token_221|>": 151895,
1195
+ "<|action_token_222|>": 151896,
1196
+ "<|action_token_223|>": 151897,
1197
+ "<|action_token_224|>": 151898,
1198
+ "<|action_token_225|>": 151899,
1199
+ "<|action_token_226|>": 151900,
1200
+ "<|action_token_227|>": 151901,
1201
+ "<|action_token_228|>": 151902,
1202
+ "<|action_token_229|>": 151903,
1203
+ "<|action_token_22|>": 151696,
1204
+ "<|action_token_230|>": 151904,
1205
+ "<|action_token_231|>": 151905,
1206
+ "<|action_token_232|>": 151906,
1207
+ "<|action_token_233|>": 151907,
1208
+ "<|action_token_234|>": 151908,
1209
+ "<|action_token_235|>": 151909,
1210
+ "<|action_token_236|>": 151910,
1211
+ "<|action_token_237|>": 151911,
1212
+ "<|action_token_238|>": 151912,
1213
+ "<|action_token_239|>": 151913,
1214
+ "<|action_token_23|>": 151697,
1215
+ "<|action_token_240|>": 151914,
1216
+ "<|action_token_241|>": 151915,
1217
+ "<|action_token_242|>": 151916,
1218
+ "<|action_token_243|>": 151917,
1219
+ "<|action_token_244|>": 151918,
1220
+ "<|action_token_245|>": 151919,
1221
+ "<|action_token_246|>": 151920,
1222
+ "<|action_token_247|>": 151921,
1223
+ "<|action_token_248|>": 151922,
1224
+ "<|action_token_249|>": 151923,
1225
+ "<|action_token_24|>": 151698,
1226
+ "<|action_token_250|>": 151924,
1227
+ "<|action_token_251|>": 151925,
1228
+ "<|action_token_252|>": 151926,
1229
+ "<|action_token_253|>": 151927,
1230
+ "<|action_token_254|>": 151928,
1231
+ "<|action_token_255|>": 151929,
1232
+ "<|action_token_256|>": 151930,
1233
+ "<|action_token_257|>": 151931,
1234
+ "<|action_token_258|>": 151932,
1235
+ "<|action_token_259|>": 151933,
1236
+ "<|action_token_25|>": 151699,
1237
+ "<|action_token_260|>": 151934,
1238
+ "<|action_token_261|>": 151935,
1239
+ "<|action_token_262|>": 151936,
1240
+ "<|action_token_263|>": 151937,
1241
+ "<|action_token_264|>": 151938,
1242
+ "<|action_token_265|>": 151939,
1243
+ "<|action_token_266|>": 151940,
1244
+ "<|action_token_267|>": 151941,
1245
+ "<|action_token_268|>": 151942,
1246
+ "<|action_token_269|>": 151943,
1247
+ "<|action_token_26|>": 151700,
1248
+ "<|action_token_270|>": 151944,
1249
+ "<|action_token_271|>": 151945,
1250
+ "<|action_token_272|>": 151946,
1251
+ "<|action_token_273|>": 151947,
1252
+ "<|action_token_274|>": 151948,
1253
+ "<|action_token_275|>": 151949,
1254
+ "<|action_token_276|>": 151950,
1255
+ "<|action_token_277|>": 151951,
1256
+ "<|action_token_278|>": 151952,
1257
+ "<|action_token_279|>": 151953,
1258
+ "<|action_token_27|>": 151701,
1259
+ "<|action_token_280|>": 151954,
1260
+ "<|action_token_281|>": 151955,
1261
+ "<|action_token_282|>": 151956,
1262
+ "<|action_token_283|>": 151957,
1263
+ "<|action_token_284|>": 151958,
1264
+ "<|action_token_285|>": 151959,
1265
+ "<|action_token_286|>": 151960,
1266
+ "<|action_token_287|>": 151961,
1267
+ "<|action_token_288|>": 151962,
1268
+ "<|action_token_289|>": 151963,
1269
+ "<|action_token_28|>": 151702,
1270
+ "<|action_token_290|>": 151964,
1271
+ "<|action_token_291|>": 151965,
1272
+ "<|action_token_292|>": 151966,
1273
+ "<|action_token_293|>": 151967,
1274
+ "<|action_token_294|>": 151968,
1275
+ "<|action_token_295|>": 151969,
1276
+ "<|action_token_296|>": 151970,
1277
+ "<|action_token_297|>": 151971,
1278
+ "<|action_token_298|>": 151972,
1279
+ "<|action_token_299|>": 151973,
1280
+ "<|action_token_29|>": 151703,
1281
+ "<|action_token_2|>": 151676,
1282
+ "<|action_token_300|>": 151974,
1283
+ "<|action_token_301|>": 151975,
1284
+ "<|action_token_302|>": 151976,
1285
+ "<|action_token_303|>": 151977,
1286
+ "<|action_token_304|>": 151978,
1287
+ "<|action_token_305|>": 151979,
1288
+ "<|action_token_306|>": 151980,
1289
+ "<|action_token_307|>": 151981,
1290
+ "<|action_token_308|>": 151982,
1291
+ "<|action_token_309|>": 151983,
1292
+ "<|action_token_30|>": 151704,
1293
+ "<|action_token_310|>": 151984,
1294
+ "<|action_token_311|>": 151985,
1295
+ "<|action_token_312|>": 151986,
1296
+ "<|action_token_313|>": 151987,
1297
+ "<|action_token_314|>": 151988,
1298
+ "<|action_token_315|>": 151989,
1299
+ "<|action_token_316|>": 151990,
1300
+ "<|action_token_317|>": 151991,
1301
+ "<|action_token_318|>": 151992,
1302
+ "<|action_token_319|>": 151993,
1303
+ "<|action_token_31|>": 151705,
1304
+ "<|action_token_320|>": 151994,
1305
+ "<|action_token_321|>": 151995,
1306
+ "<|action_token_322|>": 151996,
1307
+ "<|action_token_323|>": 151997,
1308
+ "<|action_token_324|>": 151998,
1309
+ "<|action_token_325|>": 151999,
1310
+ "<|action_token_326|>": 152000,
1311
+ "<|action_token_327|>": 152001,
1312
+ "<|action_token_328|>": 152002,
1313
+ "<|action_token_329|>": 152003,
1314
+ "<|action_token_32|>": 151706,
1315
+ "<|action_token_330|>": 152004,
1316
+ "<|action_token_331|>": 152005,
1317
+ "<|action_token_332|>": 152006,
1318
+ "<|action_token_333|>": 152007,
1319
+ "<|action_token_334|>": 152008,
1320
+ "<|action_token_335|>": 152009,
1321
+ "<|action_token_336|>": 152010,
1322
+ "<|action_token_337|>": 152011,
1323
+ "<|action_token_338|>": 152012,
1324
+ "<|action_token_339|>": 152013,
1325
+ "<|action_token_33|>": 151707,
1326
+ "<|action_token_340|>": 152014,
1327
+ "<|action_token_341|>": 152015,
1328
+ "<|action_token_342|>": 152016,
1329
+ "<|action_token_343|>": 152017,
1330
+ "<|action_token_344|>": 152018,
1331
+ "<|action_token_345|>": 152019,
1332
+ "<|action_token_346|>": 152020,
1333
+ "<|action_token_347|>": 152021,
1334
+ "<|action_token_348|>": 152022,
1335
+ "<|action_token_349|>": 152023,
1336
+ "<|action_token_34|>": 151708,
1337
+ "<|action_token_350|>": 152024,
1338
+ "<|action_token_351|>": 152025,
1339
+ "<|action_token_352|>": 152026,
1340
+ "<|action_token_353|>": 152027,
1341
+ "<|action_token_354|>": 152028,
1342
+ "<|action_token_355|>": 152029,
1343
+ "<|action_token_356|>": 152030,
1344
+ "<|action_token_357|>": 152031,
1345
+ "<|action_token_358|>": 152032,
1346
+ "<|action_token_359|>": 152033,
1347
+ "<|action_token_35|>": 151709,
1348
+ "<|action_token_360|>": 152034,
1349
+ "<|action_token_361|>": 152035,
1350
+ "<|action_token_362|>": 152036,
1351
+ "<|action_token_363|>": 152037,
1352
+ "<|action_token_364|>": 152038,
1353
+ "<|action_token_365|>": 152039,
1354
+ "<|action_token_366|>": 152040,
1355
+ "<|action_token_367|>": 152041,
1356
+ "<|action_token_368|>": 152042,
1357
+ "<|action_token_369|>": 152043,
1358
+ "<|action_token_36|>": 151710,
1359
+ "<|action_token_370|>": 152044,
1360
+ "<|action_token_371|>": 152045,
1361
+ "<|action_token_372|>": 152046,
1362
+ "<|action_token_373|>": 152047,
1363
+ "<|action_token_374|>": 152048,
1364
+ "<|action_token_375|>": 152049,
1365
+ "<|action_token_376|>": 152050,
1366
+ "<|action_token_377|>": 152051,
1367
+ "<|action_token_378|>": 152052,
1368
+ "<|action_token_379|>": 152053,
1369
+ "<|action_token_37|>": 151711,
1370
+ "<|action_token_380|>": 152054,
1371
+ "<|action_token_381|>": 152055,
1372
+ "<|action_token_382|>": 152056,
1373
+ "<|action_token_383|>": 152057,
1374
+ "<|action_token_384|>": 152058,
1375
+ "<|action_token_385|>": 152059,
1376
+ "<|action_token_386|>": 152060,
1377
+ "<|action_token_387|>": 152061,
1378
+ "<|action_token_388|>": 152062,
1379
+ "<|action_token_389|>": 152063,
1380
+ "<|action_token_38|>": 151712,
1381
+ "<|action_token_390|>": 152064,
1382
+ "<|action_token_391|>": 152065,
1383
+ "<|action_token_392|>": 152066,
1384
+ "<|action_token_393|>": 152067,
1385
+ "<|action_token_394|>": 152068,
1386
+ "<|action_token_395|>": 152069,
1387
+ "<|action_token_396|>": 152070,
1388
+ "<|action_token_397|>": 152071,
1389
+ "<|action_token_398|>": 152072,
1390
+ "<|action_token_399|>": 152073,
1391
+ "<|action_token_39|>": 151713,
1392
+ "<|action_token_3|>": 151677,
1393
+ "<|action_token_400|>": 152074,
1394
+ "<|action_token_401|>": 152075,
1395
+ "<|action_token_402|>": 152076,
1396
+ "<|action_token_403|>": 152077,
1397
+ "<|action_token_404|>": 152078,
1398
+ "<|action_token_405|>": 152079,
1399
+ "<|action_token_406|>": 152080,
1400
+ "<|action_token_407|>": 152081,
1401
+ "<|action_token_408|>": 152082,
1402
+ "<|action_token_409|>": 152083,
1403
+ "<|action_token_40|>": 151714,
1404
+ "<|action_token_410|>": 152084,
1405
+ "<|action_token_411|>": 152085,
1406
+ "<|action_token_412|>": 152086,
1407
+ "<|action_token_413|>": 152087,
1408
+ "<|action_token_414|>": 152088,
1409
+ "<|action_token_415|>": 152089,
1410
+ "<|action_token_416|>": 152090,
1411
+ "<|action_token_417|>": 152091,
1412
+ "<|action_token_418|>": 152092,
1413
+ "<|action_token_419|>": 152093,
1414
+ "<|action_token_41|>": 151715,
1415
+ "<|action_token_420|>": 152094,
1416
+ "<|action_token_421|>": 152095,
1417
+ "<|action_token_422|>": 152096,
1418
+ "<|action_token_423|>": 152097,
1419
+ "<|action_token_424|>": 152098,
1420
+ "<|action_token_425|>": 152099,
1421
+ "<|action_token_426|>": 152100,
1422
+ "<|action_token_427|>": 152101,
1423
+ "<|action_token_428|>": 152102,
1424
+ "<|action_token_429|>": 152103,
1425
+ "<|action_token_42|>": 151716,
1426
+ "<|action_token_430|>": 152104,
1427
+ "<|action_token_431|>": 152105,
1428
+ "<|action_token_432|>": 152106,
1429
+ "<|action_token_433|>": 152107,
1430
+ "<|action_token_434|>": 152108,
1431
+ "<|action_token_435|>": 152109,
1432
+ "<|action_token_436|>": 152110,
1433
+ "<|action_token_437|>": 152111,
1434
+ "<|action_token_438|>": 152112,
1435
+ "<|action_token_439|>": 152113,
1436
+ "<|action_token_43|>": 151717,
1437
+ "<|action_token_440|>": 152114,
1438
+ "<|action_token_441|>": 152115,
1439
+ "<|action_token_442|>": 152116,
1440
+ "<|action_token_443|>": 152117,
1441
+ "<|action_token_444|>": 152118,
1442
+ "<|action_token_445|>": 152119,
1443
+ "<|action_token_446|>": 152120,
1444
+ "<|action_token_447|>": 152121,
1445
+ "<|action_token_448|>": 152122,
1446
+ "<|action_token_449|>": 152123,
1447
+ "<|action_token_44|>": 151718,
1448
+ "<|action_token_450|>": 152124,
1449
+ "<|action_token_451|>": 152125,
1450
+ "<|action_token_452|>": 152126,
1451
+ "<|action_token_453|>": 152127,
1452
+ "<|action_token_454|>": 152128,
1453
+ "<|action_token_455|>": 152129,
1454
+ "<|action_token_456|>": 152130,
1455
+ "<|action_token_457|>": 152131,
1456
+ "<|action_token_458|>": 152132,
1457
+ "<|action_token_459|>": 152133,
1458
+ "<|action_token_45|>": 151719,
1459
+ "<|action_token_460|>": 152134,
1460
+ "<|action_token_461|>": 152135,
1461
+ "<|action_token_462|>": 152136,
1462
+ "<|action_token_463|>": 152137,
1463
+ "<|action_token_464|>": 152138,
1464
+ "<|action_token_465|>": 152139,
1465
+ "<|action_token_466|>": 152140,
1466
+ "<|action_token_467|>": 152141,
1467
+ "<|action_token_468|>": 152142,
1468
+ "<|action_token_469|>": 152143,
1469
+ "<|action_token_46|>": 151720,
1470
+ "<|action_token_470|>": 152144,
1471
+ "<|action_token_471|>": 152145,
1472
+ "<|action_token_472|>": 152146,
1473
+ "<|action_token_473|>": 152147,
1474
+ "<|action_token_474|>": 152148,
1475
+ "<|action_token_475|>": 152149,
1476
+ "<|action_token_476|>": 152150,
1477
+ "<|action_token_477|>": 152151,
1478
+ "<|action_token_478|>": 152152,
1479
+ "<|action_token_479|>": 152153,
1480
+ "<|action_token_47|>": 151721,
1481
+ "<|action_token_480|>": 152154,
1482
+ "<|action_token_481|>": 152155,
1483
+ "<|action_token_482|>": 152156,
1484
+ "<|action_token_483|>": 152157,
1485
+ "<|action_token_484|>": 152158,
1486
+ "<|action_token_485|>": 152159,
1487
+ "<|action_token_486|>": 152160,
1488
+ "<|action_token_487|>": 152161,
1489
+ "<|action_token_488|>": 152162,
1490
+ "<|action_token_489|>": 152163,
1491
+ "<|action_token_48|>": 151722,
1492
+ "<|action_token_490|>": 152164,
1493
+ "<|action_token_491|>": 152165,
1494
+ "<|action_token_492|>": 152166,
1495
+ "<|action_token_493|>": 152167,
1496
+ "<|action_token_494|>": 152168,
1497
+ "<|action_token_495|>": 152169,
1498
+ "<|action_token_496|>": 152170,
1499
+ "<|action_token_497|>": 152171,
1500
+ "<|action_token_498|>": 152172,
1501
+ "<|action_token_499|>": 152173,
1502
+ "<|action_token_49|>": 151723,
1503
+ "<|action_token_4|>": 151678,
1504
+ "<|action_token_500|>": 152174,
1505
+ "<|action_token_501|>": 152175,
1506
+ "<|action_token_502|>": 152176,
1507
+ "<|action_token_503|>": 152177,
1508
+ "<|action_token_504|>": 152178,
1509
+ "<|action_token_505|>": 152179,
1510
+ "<|action_token_506|>": 152180,
1511
+ "<|action_token_507|>": 152181,
1512
+ "<|action_token_508|>": 152182,
1513
+ "<|action_token_509|>": 152183,
1514
+ "<|action_token_50|>": 151724,
1515
+ "<|action_token_510|>": 152184,
1516
+ "<|action_token_511|>": 152185,
1517
+ "<|action_token_512|>": 152186,
1518
+ "<|action_token_513|>": 152187,
1519
+ "<|action_token_514|>": 152188,
1520
+ "<|action_token_515|>": 152189,
1521
+ "<|action_token_516|>": 152190,
1522
+ "<|action_token_517|>": 152191,
1523
+ "<|action_token_518|>": 152192,
1524
+ "<|action_token_519|>": 152193,
1525
+ "<|action_token_51|>": 151725,
1526
+ "<|action_token_520|>": 152194,
1527
+ "<|action_token_521|>": 152195,
1528
+ "<|action_token_522|>": 152196,
1529
+ "<|action_token_523|>": 152197,
1530
+ "<|action_token_524|>": 152198,
1531
+ "<|action_token_525|>": 152199,
1532
+ "<|action_token_526|>": 152200,
1533
+ "<|action_token_527|>": 152201,
1534
+ "<|action_token_528|>": 152202,
1535
+ "<|action_token_529|>": 152203,
1536
+ "<|action_token_52|>": 151726,
1537
+ "<|action_token_530|>": 152204,
1538
+ "<|action_token_531|>": 152205,
1539
+ "<|action_token_532|>": 152206,
1540
+ "<|action_token_533|>": 152207,
1541
+ "<|action_token_534|>": 152208,
1542
+ "<|action_token_535|>": 152209,
1543
+ "<|action_token_536|>": 152210,
1544
+ "<|action_token_537|>": 152211,
1545
+ "<|action_token_538|>": 152212,
1546
+ "<|action_token_539|>": 152213,
1547
+ "<|action_token_53|>": 151727,
1548
+ "<|action_token_540|>": 152214,
1549
+ "<|action_token_541|>": 152215,
1550
+ "<|action_token_542|>": 152216,
1551
+ "<|action_token_543|>": 152217,
1552
+ "<|action_token_544|>": 152218,
1553
+ "<|action_token_545|>": 152219,
1554
+ "<|action_token_546|>": 152220,
1555
+ "<|action_token_547|>": 152221,
1556
+ "<|action_token_548|>": 152222,
1557
+ "<|action_token_549|>": 152223,
1558
+ "<|action_token_54|>": 151728,
1559
+ "<|action_token_550|>": 152224,
1560
+ "<|action_token_551|>": 152225,
1561
+ "<|action_token_552|>": 152226,
1562
+ "<|action_token_553|>": 152227,
1563
+ "<|action_token_554|>": 152228,
1564
+ "<|action_token_555|>": 152229,
1565
+ "<|action_token_556|>": 152230,
1566
+ "<|action_token_557|>": 152231,
1567
+ "<|action_token_558|>": 152232,
1568
+ "<|action_token_559|>": 152233,
1569
+ "<|action_token_55|>": 151729,
1570
+ "<|action_token_560|>": 152234,
1571
+ "<|action_token_561|>": 152235,
1572
+ "<|action_token_562|>": 152236,
1573
+ "<|action_token_563|>": 152237,
1574
+ "<|action_token_564|>": 152238,
1575
+ "<|action_token_565|>": 152239,
1576
+ "<|action_token_566|>": 152240,
1577
+ "<|action_token_567|>": 152241,
1578
+ "<|action_token_568|>": 152242,
1579
+ "<|action_token_569|>": 152243,
1580
+ "<|action_token_56|>": 151730,
1581
+ "<|action_token_570|>": 152244,
1582
+ "<|action_token_571|>": 152245,
1583
+ "<|action_token_572|>": 152246,
1584
+ "<|action_token_573|>": 152247,
1585
+ "<|action_token_574|>": 152248,
1586
+ "<|action_token_575|>": 152249,
1587
+ "<|action_token_576|>": 152250,
1588
+ "<|action_token_577|>": 152251,
1589
+ "<|action_token_578|>": 152252,
1590
+ "<|action_token_579|>": 152253,
1591
+ "<|action_token_57|>": 151731,
1592
+ "<|action_token_580|>": 152254,
1593
+ "<|action_token_581|>": 152255,
1594
+ "<|action_token_582|>": 152256,
1595
+ "<|action_token_583|>": 152257,
1596
+ "<|action_token_584|>": 152258,
1597
+ "<|action_token_585|>": 152259,
1598
+ "<|action_token_586|>": 152260,
1599
+ "<|action_token_587|>": 152261,
1600
+ "<|action_token_588|>": 152262,
1601
+ "<|action_token_589|>": 152263,
1602
+ "<|action_token_58|>": 151732,
1603
+ "<|action_token_590|>": 152264,
1604
+ "<|action_token_591|>": 152265,
1605
+ "<|action_token_592|>": 152266,
1606
+ "<|action_token_593|>": 152267,
1607
+ "<|action_token_594|>": 152268,
1608
+ "<|action_token_595|>": 152269,
1609
+ "<|action_token_596|>": 152270,
1610
+ "<|action_token_597|>": 152271,
1611
+ "<|action_token_598|>": 152272,
1612
+ "<|action_token_599|>": 152273,
1613
+ "<|action_token_59|>": 151733,
1614
+ "<|action_token_5|>": 151679,
1615
+ "<|action_token_600|>": 152274,
1616
+ "<|action_token_601|>": 152275,
1617
+ "<|action_token_602|>": 152276,
1618
+ "<|action_token_603|>": 152277,
1619
+ "<|action_token_604|>": 152278,
1620
+ "<|action_token_605|>": 152279,
1621
+ "<|action_token_606|>": 152280,
1622
+ "<|action_token_607|>": 152281,
1623
+ "<|action_token_608|>": 152282,
1624
+ "<|action_token_609|>": 152283,
1625
+ "<|action_token_60|>": 151734,
1626
+ "<|action_token_610|>": 152284,
1627
+ "<|action_token_611|>": 152285,
1628
+ "<|action_token_612|>": 152286,
1629
+ "<|action_token_613|>": 152287,
1630
+ "<|action_token_614|>": 152288,
1631
+ "<|action_token_615|>": 152289,
1632
+ "<|action_token_616|>": 152290,
1633
+ "<|action_token_617|>": 152291,
1634
+ "<|action_token_618|>": 152292,
1635
+ "<|action_token_619|>": 152293,
1636
+ "<|action_token_61|>": 151735,
1637
+ "<|action_token_620|>": 152294,
1638
+ "<|action_token_621|>": 152295,
1639
+ "<|action_token_622|>": 152296,
1640
+ "<|action_token_623|>": 152297,
1641
+ "<|action_token_624|>": 152298,
1642
+ "<|action_token_625|>": 152299,
1643
+ "<|action_token_626|>": 152300,
1644
+ "<|action_token_627|>": 152301,
1645
+ "<|action_token_628|>": 152302,
1646
+ "<|action_token_629|>": 152303,
1647
+ "<|action_token_62|>": 151736,
1648
+ "<|action_token_630|>": 152304,
1649
+ "<|action_token_631|>": 152305,
1650
+ "<|action_token_632|>": 152306,
1651
+ "<|action_token_633|>": 152307,
1652
+ "<|action_token_634|>": 152308,
1653
+ "<|action_token_635|>": 152309,
1654
+ "<|action_token_636|>": 152310,
1655
+ "<|action_token_637|>": 152311,
1656
+ "<|action_token_638|>": 152312,
1657
+ "<|action_token_639|>": 152313,
1658
+ "<|action_token_63|>": 151737,
1659
+ "<|action_token_640|>": 152314,
1660
+ "<|action_token_641|>": 152315,
1661
+ "<|action_token_642|>": 152316,
1662
+ "<|action_token_643|>": 152317,
1663
+ "<|action_token_644|>": 152318,
1664
+ "<|action_token_645|>": 152319,
1665
+ "<|action_token_646|>": 152320,
1666
+ "<|action_token_647|>": 152321,
1667
+ "<|action_token_648|>": 152322,
1668
+ "<|action_token_649|>": 152323,
1669
+ "<|action_token_64|>": 151738,
1670
+ "<|action_token_650|>": 152324,
1671
+ "<|action_token_651|>": 152325,
1672
+ "<|action_token_652|>": 152326,
1673
+ "<|action_token_653|>": 152327,
1674
+ "<|action_token_654|>": 152328,
1675
+ "<|action_token_655|>": 152329,
1676
+ "<|action_token_656|>": 152330,
1677
+ "<|action_token_657|>": 152331,
1678
+ "<|action_token_658|>": 152332,
1679
+ "<|action_token_659|>": 152333,
1680
+ "<|action_token_65|>": 151739,
1681
+ "<|action_token_660|>": 152334,
1682
+ "<|action_token_661|>": 152335,
1683
+ "<|action_token_662|>": 152336,
1684
+ "<|action_token_663|>": 152337,
1685
+ "<|action_token_664|>": 152338,
1686
+ "<|action_token_665|>": 152339,
1687
+ "<|action_token_666|>": 152340,
1688
+ "<|action_token_667|>": 152341,
1689
+ "<|action_token_668|>": 152342,
1690
+ "<|action_token_669|>": 152343,
1691
+ "<|action_token_66|>": 151740,
1692
+ "<|action_token_670|>": 152344,
1693
+ "<|action_token_671|>": 152345,
1694
+ "<|action_token_672|>": 152346,
1695
+ "<|action_token_673|>": 152347,
1696
+ "<|action_token_674|>": 152348,
1697
+ "<|action_token_675|>": 152349,
1698
+ "<|action_token_676|>": 152350,
1699
+ "<|action_token_677|>": 152351,
1700
+ "<|action_token_678|>": 152352,
1701
+ "<|action_token_679|>": 152353,
1702
+ "<|action_token_67|>": 151741,
1703
+ "<|action_token_680|>": 152354,
1704
+ "<|action_token_681|>": 152355,
1705
+ "<|action_token_682|>": 152356,
1706
+ "<|action_token_683|>": 152357,
1707
+ "<|action_token_684|>": 152358,
1708
+ "<|action_token_685|>": 152359,
1709
+ "<|action_token_686|>": 152360,
1710
+ "<|action_token_687|>": 152361,
1711
+ "<|action_token_688|>": 152362,
1712
+ "<|action_token_689|>": 152363,
1713
+ "<|action_token_68|>": 151742,
1714
+ "<|action_token_690|>": 152364,
1715
+ "<|action_token_691|>": 152365,
1716
+ "<|action_token_692|>": 152366,
1717
+ "<|action_token_693|>": 152367,
1718
+ "<|action_token_694|>": 152368,
1719
+ "<|action_token_695|>": 152369,
1720
+ "<|action_token_696|>": 152370,
1721
+ "<|action_token_697|>": 152371,
1722
+ "<|action_token_698|>": 152372,
1723
+ "<|action_token_699|>": 152373,
1724
+ "<|action_token_69|>": 151743,
1725
+ "<|action_token_6|>": 151680,
1726
+ "<|action_token_700|>": 152374,
1727
+ "<|action_token_701|>": 152375,
1728
+ "<|action_token_702|>": 152376,
1729
+ "<|action_token_703|>": 152377,
1730
+ "<|action_token_704|>": 152378,
1731
+ "<|action_token_705|>": 152379,
1732
+ "<|action_token_706|>": 152380,
1733
+ "<|action_token_707|>": 152381,
1734
+ "<|action_token_708|>": 152382,
1735
+ "<|action_token_709|>": 152383,
1736
+ "<|action_token_70|>": 151744,
1737
+ "<|action_token_710|>": 152384,
1738
+ "<|action_token_711|>": 152385,
1739
+ "<|action_token_712|>": 152386,
1740
+ "<|action_token_713|>": 152387,
1741
+ "<|action_token_714|>": 152388,
1742
+ "<|action_token_715|>": 152389,
1743
+ "<|action_token_716|>": 152390,
1744
+ "<|action_token_717|>": 152391,
1745
+ "<|action_token_718|>": 152392,
1746
+ "<|action_token_719|>": 152393,
1747
+ "<|action_token_71|>": 151745,
1748
+ "<|action_token_720|>": 152394,
1749
+ "<|action_token_721|>": 152395,
1750
+ "<|action_token_722|>": 152396,
1751
+ "<|action_token_723|>": 152397,
1752
+ "<|action_token_724|>": 152398,
1753
+ "<|action_token_725|>": 152399,
1754
+ "<|action_token_726|>": 152400,
1755
+ "<|action_token_727|>": 152401,
1756
+ "<|action_token_728|>": 152402,
1757
+ "<|action_token_729|>": 152403,
1758
+ "<|action_token_72|>": 151746,
1759
+ "<|action_token_730|>": 152404,
1760
+ "<|action_token_731|>": 152405,
1761
+ "<|action_token_732|>": 152406,
1762
+ "<|action_token_733|>": 152407,
1763
+ "<|action_token_734|>": 152408,
1764
+ "<|action_token_735|>": 152409,
1765
+ "<|action_token_736|>": 152410,
1766
+ "<|action_token_737|>": 152411,
1767
+ "<|action_token_738|>": 152412,
1768
+ "<|action_token_739|>": 152413,
1769
+ "<|action_token_73|>": 151747,
1770
+ "<|action_token_740|>": 152414,
1771
+ "<|action_token_741|>": 152415,
1772
+ "<|action_token_742|>": 152416,
1773
+ "<|action_token_743|>": 152417,
1774
+ "<|action_token_744|>": 152418,
1775
+ "<|action_token_745|>": 152419,
1776
+ "<|action_token_746|>": 152420,
1777
+ "<|action_token_747|>": 152421,
1778
+ "<|action_token_748|>": 152422,
1779
+ "<|action_token_749|>": 152423,
1780
+ "<|action_token_74|>": 151748,
1781
+ "<|action_token_750|>": 152424,
1782
+ "<|action_token_751|>": 152425,
1783
+ "<|action_token_752|>": 152426,
1784
+ "<|action_token_753|>": 152427,
1785
+ "<|action_token_754|>": 152428,
1786
+ "<|action_token_755|>": 152429,
1787
+ "<|action_token_756|>": 152430,
1788
+ "<|action_token_757|>": 152431,
1789
+ "<|action_token_758|>": 152432,
1790
+ "<|action_token_759|>": 152433,
1791
+ "<|action_token_75|>": 151749,
1792
+ "<|action_token_760|>": 152434,
1793
+ "<|action_token_761|>": 152435,
1794
+ "<|action_token_762|>": 152436,
1795
+ "<|action_token_763|>": 152437,
1796
+ "<|action_token_764|>": 152438,
1797
+ "<|action_token_765|>": 152439,
1798
+ "<|action_token_766|>": 152440,
1799
+ "<|action_token_767|>": 152441,
1800
+ "<|action_token_768|>": 152442,
1801
+ "<|action_token_769|>": 152443,
1802
+ "<|action_token_76|>": 151750,
1803
+ "<|action_token_770|>": 152444,
1804
+ "<|action_token_771|>": 152445,
1805
+ "<|action_token_772|>": 152446,
1806
+ "<|action_token_773|>": 152447,
1807
+ "<|action_token_774|>": 152448,
1808
+ "<|action_token_775|>": 152449,
1809
+ "<|action_token_776|>": 152450,
1810
+ "<|action_token_777|>": 152451,
1811
+ "<|action_token_778|>": 152452,
1812
+ "<|action_token_779|>": 152453,
1813
+ "<|action_token_77|>": 151751,
1814
+ "<|action_token_780|>": 152454,
1815
+ "<|action_token_781|>": 152455,
1816
+ "<|action_token_782|>": 152456,
1817
+ "<|action_token_783|>": 152457,
1818
+ "<|action_token_784|>": 152458,
1819
+ "<|action_token_785|>": 152459,
1820
+ "<|action_token_786|>": 152460,
1821
+ "<|action_token_787|>": 152461,
1822
+ "<|action_token_788|>": 152462,
1823
+ "<|action_token_789|>": 152463,
1824
+ "<|action_token_78|>": 151752,
1825
+ "<|action_token_790|>": 152464,
1826
+ "<|action_token_791|>": 152465,
1827
+ "<|action_token_792|>": 152466,
1828
+ "<|action_token_793|>": 152467,
1829
+ "<|action_token_794|>": 152468,
1830
+ "<|action_token_795|>": 152469,
1831
+ "<|action_token_796|>": 152470,
1832
+ "<|action_token_797|>": 152471,
1833
+ "<|action_token_798|>": 152472,
1834
+ "<|action_token_799|>": 152473,
1835
+ "<|action_token_79|>": 151753,
1836
+ "<|action_token_7|>": 151681,
1837
+ "<|action_token_800|>": 152474,
1838
+ "<|action_token_801|>": 152475,
1839
+ "<|action_token_802|>": 152476,
1840
+ "<|action_token_803|>": 152477,
1841
+ "<|action_token_804|>": 152478,
1842
+ "<|action_token_805|>": 152479,
1843
+ "<|action_token_806|>": 152480,
1844
+ "<|action_token_807|>": 152481,
1845
+ "<|action_token_808|>": 152482,
1846
+ "<|action_token_809|>": 152483,
1847
+ "<|action_token_80|>": 151754,
1848
+ "<|action_token_810|>": 152484,
1849
+ "<|action_token_811|>": 152485,
1850
+ "<|action_token_812|>": 152486,
1851
+ "<|action_token_813|>": 152487,
1852
+ "<|action_token_814|>": 152488,
1853
+ "<|action_token_815|>": 152489,
1854
+ "<|action_token_816|>": 152490,
1855
+ "<|action_token_817|>": 152491,
1856
+ "<|action_token_818|>": 152492,
1857
+ "<|action_token_819|>": 152493,
1858
+ "<|action_token_81|>": 151755,
1859
+ "<|action_token_820|>": 152494,
1860
+ "<|action_token_821|>": 152495,
1861
+ "<|action_token_822|>": 152496,
1862
+ "<|action_token_823|>": 152497,
1863
+ "<|action_token_824|>": 152498,
1864
+ "<|action_token_825|>": 152499,
1865
+ "<|action_token_826|>": 152500,
1866
+ "<|action_token_827|>": 152501,
1867
+ "<|action_token_828|>": 152502,
1868
+ "<|action_token_829|>": 152503,
1869
+ "<|action_token_82|>": 151756,
1870
+ "<|action_token_830|>": 152504,
1871
+ "<|action_token_831|>": 152505,
1872
+ "<|action_token_832|>": 152506,
1873
+ "<|action_token_833|>": 152507,
1874
+ "<|action_token_834|>": 152508,
1875
+ "<|action_token_835|>": 152509,
1876
+ "<|action_token_836|>": 152510,
1877
+ "<|action_token_837|>": 152511,
1878
+ "<|action_token_838|>": 152512,
1879
+ "<|action_token_839|>": 152513,
1880
+ "<|action_token_83|>": 151757,
1881
+ "<|action_token_840|>": 152514,
1882
+ "<|action_token_841|>": 152515,
1883
+ "<|action_token_842|>": 152516,
1884
+ "<|action_token_843|>": 152517,
1885
+ "<|action_token_844|>": 152518,
1886
+ "<|action_token_845|>": 152519,
1887
+ "<|action_token_846|>": 152520,
1888
+ "<|action_token_847|>": 152521,
1889
+ "<|action_token_848|>": 152522,
1890
+ "<|action_token_849|>": 152523,
1891
+ "<|action_token_84|>": 151758,
1892
+ "<|action_token_850|>": 152524,
1893
+ "<|action_token_851|>": 152525,
1894
+ "<|action_token_852|>": 152526,
1895
+ "<|action_token_853|>": 152527,
1896
+ "<|action_token_854|>": 152528,
1897
+ "<|action_token_855|>": 152529,
1898
+ "<|action_token_856|>": 152530,
1899
+ "<|action_token_857|>": 152531,
1900
+ "<|action_token_858|>": 152532,
1901
+ "<|action_token_859|>": 152533,
1902
+ "<|action_token_85|>": 151759,
1903
+ "<|action_token_860|>": 152534,
1904
+ "<|action_token_861|>": 152535,
1905
+ "<|action_token_862|>": 152536,
1906
+ "<|action_token_863|>": 152537,
1907
+ "<|action_token_864|>": 152538,
1908
+ "<|action_token_865|>": 152539,
1909
+ "<|action_token_866|>": 152540,
1910
+ "<|action_token_867|>": 152541,
1911
+ "<|action_token_868|>": 152542,
1912
+ "<|action_token_869|>": 152543,
1913
+ "<|action_token_86|>": 151760,
1914
+ "<|action_token_870|>": 152544,
1915
+ "<|action_token_871|>": 152545,
1916
+ "<|action_token_872|>": 152546,
1917
+ "<|action_token_873|>": 152547,
1918
+ "<|action_token_874|>": 152548,
1919
+ "<|action_token_875|>": 152549,
1920
+ "<|action_token_876|>": 152550,
1921
+ "<|action_token_877|>": 152551,
1922
+ "<|action_token_878|>": 152552,
1923
+ "<|action_token_879|>": 152553,
1924
+ "<|action_token_87|>": 151761,
1925
+ "<|action_token_880|>": 152554,
1926
+ "<|action_token_881|>": 152555,
1927
+ "<|action_token_882|>": 152556,
1928
+ "<|action_token_883|>": 152557,
1929
+ "<|action_token_884|>": 152558,
1930
+ "<|action_token_885|>": 152559,
1931
+ "<|action_token_886|>": 152560,
1932
+ "<|action_token_887|>": 152561,
1933
+ "<|action_token_888|>": 152562,
1934
+ "<|action_token_889|>": 152563,
1935
+ "<|action_token_88|>": 151762,
1936
+ "<|action_token_890|>": 152564,
1937
+ "<|action_token_891|>": 152565,
1938
+ "<|action_token_892|>": 152566,
1939
+ "<|action_token_893|>": 152567,
1940
+ "<|action_token_894|>": 152568,
1941
+ "<|action_token_895|>": 152569,
1942
+ "<|action_token_896|>": 152570,
1943
+ "<|action_token_897|>": 152571,
1944
+ "<|action_token_898|>": 152572,
1945
+ "<|action_token_899|>": 152573,
1946
+ "<|action_token_89|>": 151763,
1947
+ "<|action_token_8|>": 151682,
1948
+ "<|action_token_900|>": 152574,
1949
+ "<|action_token_901|>": 152575,
1950
+ "<|action_token_902|>": 152576,
1951
+ "<|action_token_903|>": 152577,
1952
+ "<|action_token_904|>": 152578,
1953
+ "<|action_token_905|>": 152579,
1954
+ "<|action_token_906|>": 152580,
1955
+ "<|action_token_907|>": 152581,
1956
+ "<|action_token_908|>": 152582,
1957
+ "<|action_token_909|>": 152583,
1958
+ "<|action_token_90|>": 151764,
1959
+ "<|action_token_910|>": 152584,
1960
+ "<|action_token_911|>": 152585,
1961
+ "<|action_token_912|>": 152586,
1962
+ "<|action_token_913|>": 152587,
1963
+ "<|action_token_914|>": 152588,
1964
+ "<|action_token_915|>": 152589,
1965
+ "<|action_token_916|>": 152590,
1966
+ "<|action_token_917|>": 152591,
1967
+ "<|action_token_918|>": 152592,
1968
+ "<|action_token_919|>": 152593,
1969
+ "<|action_token_91|>": 151765,
1970
+ "<|action_token_920|>": 152594,
1971
+ "<|action_token_921|>": 152595,
1972
+ "<|action_token_922|>": 152596,
1973
+ "<|action_token_923|>": 152597,
1974
+ "<|action_token_924|>": 152598,
1975
+ "<|action_token_925|>": 152599,
1976
+ "<|action_token_926|>": 152600,
1977
+ "<|action_token_927|>": 152601,
1978
+ "<|action_token_928|>": 152602,
1979
+ "<|action_token_929|>": 152603,
1980
+ "<|action_token_92|>": 151766,
1981
+ "<|action_token_930|>": 152604,
1982
+ "<|action_token_931|>": 152605,
1983
+ "<|action_token_932|>": 152606,
1984
+ "<|action_token_933|>": 152607,
1985
+ "<|action_token_934|>": 152608,
1986
+ "<|action_token_935|>": 152609,
1987
+ "<|action_token_936|>": 152610,
1988
+ "<|action_token_937|>": 152611,
1989
+ "<|action_token_938|>": 152612,
1990
+ "<|action_token_939|>": 152613,
1991
+ "<|action_token_93|>": 151767,
1992
+ "<|action_token_940|>": 152614,
1993
+ "<|action_token_941|>": 152615,
1994
+ "<|action_token_942|>": 152616,
1995
+ "<|action_token_943|>": 152617,
1996
+ "<|action_token_944|>": 152618,
1997
+ "<|action_token_945|>": 152619,
1998
+ "<|action_token_946|>": 152620,
1999
+ "<|action_token_947|>": 152621,
2000
+ "<|action_token_948|>": 152622,
2001
+ "<|action_token_949|>": 152623,
2002
+ "<|action_token_94|>": 151768,
2003
+ "<|action_token_950|>": 152624,
2004
+ "<|action_token_951|>": 152625,
2005
+ "<|action_token_952|>": 152626,
2006
+ "<|action_token_953|>": 152627,
2007
+ "<|action_token_954|>": 152628,
2008
+ "<|action_token_955|>": 152629,
2009
+ "<|action_token_956|>": 152630,
2010
+ "<|action_token_957|>": 152631,
2011
+ "<|action_token_958|>": 152632,
2012
+ "<|action_token_959|>": 152633,
2013
+ "<|action_token_95|>": 151769,
2014
+ "<|action_token_960|>": 152634,
2015
+ "<|action_token_961|>": 152635,
2016
+ "<|action_token_962|>": 152636,
2017
+ "<|action_token_963|>": 152637,
2018
+ "<|action_token_964|>": 152638,
2019
+ "<|action_token_965|>": 152639,
2020
+ "<|action_token_966|>": 152640,
2021
+ "<|action_token_967|>": 152641,
2022
+ "<|action_token_968|>": 152642,
2023
+ "<|action_token_969|>": 152643,
2024
+ "<|action_token_96|>": 151770,
2025
+ "<|action_token_970|>": 152644,
2026
+ "<|action_token_971|>": 152645,
2027
+ "<|action_token_972|>": 152646,
2028
+ "<|action_token_973|>": 152647,
2029
+ "<|action_token_974|>": 152648,
2030
+ "<|action_token_975|>": 152649,
2031
+ "<|action_token_976|>": 152650,
2032
+ "<|action_token_977|>": 152651,
2033
+ "<|action_token_978|>": 152652,
2034
+ "<|action_token_979|>": 152653,
2035
+ "<|action_token_97|>": 151771,
2036
+ "<|action_token_980|>": 152654,
2037
+ "<|action_token_981|>": 152655,
2038
+ "<|action_token_982|>": 152656,
2039
+ "<|action_token_983|>": 152657,
2040
+ "<|action_token_984|>": 152658,
2041
+ "<|action_token_985|>": 152659,
2042
+ "<|action_token_986|>": 152660,
2043
+ "<|action_token_987|>": 152661,
2044
+ "<|action_token_988|>": 152662,
2045
+ "<|action_token_989|>": 152663,
2046
+ "<|action_token_98|>": 151772,
2047
+ "<|action_token_990|>": 152664,
2048
+ "<|action_token_991|>": 152665,
2049
+ "<|action_token_992|>": 152666,
2050
+ "<|action_token_993|>": 152667,
2051
+ "<|action_token_994|>": 152668,
2052
+ "<|action_token_995|>": 152669,
2053
+ "<|action_token_996|>": 152670,
2054
+ "<|action_token_997|>": 152671,
2055
+ "<|action_token_998|>": 152672,
2056
+ "<|action_token_999|>": 152673,
2057
+ "<|action_token_99|>": 151773,
2058
+ "<|action_token_9|>": 151683,
2059
+ "<|box_end|>": 151649,
2060
+ "<|box_start|>": 151648,
2061
+ "<|endoftext|>": 151643,
2062
+ "<|file_sep|>": 151664,
2063
+ "<|fim_middle|>": 151660,
2064
+ "<|fim_pad|>": 151662,
2065
+ "<|fim_prefix|>": 151659,
2066
+ "<|fim_suffix|>": 151661,
2067
+ "<|goal_repr|>": 151672,
2068
+ "<|im_end|>": 151645,
2069
+ "<|im_start|>": 151644,
2070
+ "<|image_pad|>": 151655,
2071
+ "<|object_ref_end|>": 151647,
2072
+ "<|object_ref_start|>": 151646,
2073
+ "<|obs_repr|>": 151673,
2074
+ "<|quad_end|>": 151651,
2075
+ "<|quad_start|>": 151650,
2076
+ "<|repo_name|>": 151663,
2077
+ "<|video_pad|>": 151656,
2078
+ "<|vision_end|>": 151653,
2079
+ "<|vision_pad|>": 151654,
2080
+ "<|vision_start|>": 151652
2081
+ }
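For orientation, a minimal sketch of how the special tokens registered above resolve to ids once the tokenizer shipped in this repo is loaded; the checkpoint path is a placeholder, and the ids in the asserts are the ones listed in added_tokens.json.

```python
# Sketch only: "path/to/this/checkpoint" stands in for a local download of this repo.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/checkpoint")

# Ids taken from added_tokens.json above.
assert tokenizer.convert_tokens_to_ids("<|action_token_2|>") == 151676
assert tokenizer.convert_tokens_to_ids("<|goal_repr|>") == 151672
assert tokenizer.convert_tokens_to_ids("<|obs_repr|>") == 151673
```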
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
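A minimal sketch of rendering the template above through the standard `apply_chat_template` API; the checkpoint path is a placeholder and the message contents are illustrative only.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/checkpoint")

messages = [
    {"role": "system", "content": "You are a helpful robot policy."},
    {
        "role": "user",
        "content": [
            {"type": "image"},  # rendered as <|vision_start|><|image_pad|><|vision_end|>
            {"type": "text", "text": "Pick up the red block."},
        ],
    },
]

# add_generation_prompt=True triggers the template's final branch, which appends
# the trailing "<|im_start|>assistant\n".
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```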
config.json ADDED
@@ -0,0 +1,131 @@
1
+ {
2
+ "action_chunk_size": 50,
3
+ "action_expert_config": {
4
+ "action_end_token_id": null,
5
+ "action_start_token_id": 151669,
6
+ "action_token_id": 151670,
7
+ "attention_bias": false,
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 151643,
10
+ "crl_goal_repr_token_id": 151672,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 151645,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 1280,
16
+ "image_token_id": 151655,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 2432,
19
+ "max_position_embeddings": 262144,
20
+ "model_type": "prts_qwen3_vl_text",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 36,
23
+ "num_key_value_heads": 8,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": {
26
+ "mrope_interleaved": true,
27
+ "mrope_section": [
28
+ 24,
29
+ 20,
30
+ 20
31
+ ],
32
+ "rope_type": "default"
33
+ },
34
+ "rope_theta": 5000000,
35
+ "tie_word_embeddings": true,
36
+ "use_cache": true,
37
+ "video_token_id": 151656,
38
+ "vision_start_token_id": 151652,
39
+ "vocab_size": 153722
40
+ },
41
+ "action_start_token_id": 151669,
42
+ "architectures": [
43
+ "PRTS_Qwen3VL"
44
+ ],
45
+ "auto_map": {
46
+ "AutoConfig": "configuration_prts_qwen3_vl.PRTS_FlowMatchingConfig_Qwen3VL",
47
+ "AutoModel": "modeling_prts_qwen3_vl.PRTS_Qwen3VL"
48
+ },
49
+ "crl_embed_dim": 256,
50
+ "crl_encoder_init_w": 0.001,
51
+ "crl_goal_repr_token_id": 151672,
52
+ "crl_logsumexp_reg_weight": 0.0,
53
+ "crl_loss_weight": 1.0,
54
+ "crl_repr_norm": true,
55
+ "dtype": "bfloat16",
56
+ "flow_matching_action_loss_weight": 0.0,
57
+ "flow_matching_sub_goal_loss_weight": 0.0,
58
+ "image_token_id": 151655,
59
+ "label2id": null,
60
+ "max_action_dim": 32,
61
+ "model_type": "prts_qwen3_vl",
62
+ "num_denoise_steps": 5,
63
+ "pad_token_id": 151643,
64
+ "text_config": {
65
+ "action_end_token_id": null,
66
+ "action_start_token_id": 151669,
67
+ "action_token_id": 151670,
68
+ "attention_bias": false,
69
+ "attention_dropout": 0.0,
70
+ "bos_token_id": 151643,
71
+ "crl_goal_repr_token_id": 151672,
72
+ "dtype": "bfloat16",
73
+ "eos_token_id": 151645,
74
+ "head_dim": 128,
75
+ "hidden_act": "silu",
76
+ "hidden_size": 2560,
77
+ "image_token_id": 151655,
78
+ "initializer_range": 0.02,
79
+ "intermediate_size": 9728,
80
+ "max_position_embeddings": 262144,
81
+ "model_type": "prts_qwen3_vl_text",
82
+ "num_attention_heads": 32,
83
+ "num_hidden_layers": 36,
84
+ "num_key_value_heads": 8,
85
+ "rms_norm_eps": 1e-06,
86
+ "rope_scaling": {
87
+ "mrope_interleaved": true,
88
+ "mrope_section": [
89
+ 24,
90
+ 20,
91
+ 20
92
+ ],
93
+ "rope_type": "default"
94
+ },
95
+ "rope_theta": 5000000,
96
+ "tie_word_embeddings": true,
97
+ "use_cache": false,
98
+ "video_token_id": 151656,
99
+ "vision_start_token_id": 151652,
100
+ "vocab_size": 153722
101
+ },
102
+ "tie_word_embeddings": true,
103
+ "transformers_version": "4.57.3",
104
+ "use_cache": true,
105
+ "use_fast_action_tokenizer": true,
106
+ "video_token_id": 151656,
107
+ "vision_config": {
108
+ "deepstack_visual_indexes": [
109
+ 5,
110
+ 11,
111
+ 17
112
+ ],
113
+ "depth": 24,
114
+ "dtype": "bfloat16",
115
+ "hidden_act": "gelu_pytorch_tanh",
116
+ "hidden_size": 1024,
117
+ "in_channels": 3,
118
+ "initializer_range": 0.02,
119
+ "intermediate_size": 4096,
120
+ "model_type": "qwen3_vl",
121
+ "num_heads": 16,
122
+ "num_position_embeddings": 2304,
123
+ "out_hidden_size": 2560,
124
+ "patch_size": 16,
125
+ "spatial_merge_size": 2,
126
+ "temporal_patch_size": 2
127
+ },
128
+ "vision_end_token_id": 151653,
129
+ "vision_start_token_id": 151652,
130
+ "vocab_size": 153722
131
+ }
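The `auto_map` block above is what lets the stock Auto classes pick up the custom configuration and modeling code in this repo; a minimal loading sketch follows (placeholder path; `trust_remote_code=True` is required for the remote-code path).

```python
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("path/to/this/checkpoint", trust_remote_code=True)
print(config.model_type)               # "prts_qwen3_vl"
print(config.action_chunk_size)        # 50, as in config.json above
print(config.text_config.hidden_size)  # 2560

model = AutoModel.from_pretrained("path/to/this/checkpoint", trust_remote_code=True)
```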
configuration_prts_qwen3_vl.py ADDED
@@ -0,0 +1,345 @@
1
+ # Copyright 2025 TeleAI Rhodes Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Configuration classes for PRTS built on Qwen3-VL."""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLVisionConfig
20
+
21
+
22
+ class PRTS_Qwen3VLTextConfig(PretrainedConfig):
23
+ r"""
24
+ This is the configuration class to store the configuration of a PRTS Text Model based on Qwen3-VL.
25
+ It extends PretrainedConfig with Qwen3-VL text model parameters and PRTS-specific parameters.
26
+
27
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
28
+
29
+ Args:
30
+ vocab_size (`int`, *optional*, defaults to 151936):
31
+ Vocabulary size of the Qwen3VL model.
32
+ hidden_size (`int`, *optional*, defaults to 4096):
33
+ Dimension of the hidden representations.
34
+ intermediate_size (`int`, *optional*, defaults to 22016):
35
+ Dimension of the MLP representations.
36
+ num_hidden_layers (`int`, *optional*, defaults to 32):
37
+ Number of hidden layers in the Transformer encoder.
38
+ num_attention_heads (`int`, *optional*, defaults to 32):
39
+ Number of attention heads for each attention layer.
40
+ num_key_value_heads (`int`, *optional*, defaults to 32):
41
+ Number of key-value heads for Grouped Query Attention.
42
+ head_dim (`int`, *optional*, defaults to 128):
43
+ The dimension of the head.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
45
+ The non-linear activation function.
46
+ max_position_embeddings (`int`, *optional*, defaults to 128000):
47
+ The maximum sequence length.
48
+ initializer_range (`float`, *optional*, defaults to 0.02):
49
+ The standard deviation of the truncated_normal_initializer.
50
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
51
+ The epsilon used by the rms normalization layers.
52
+ use_cache (`bool`, *optional*, defaults to `True`):
53
+ Whether or not the model should return the last key/values attentions.
54
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
55
+ Whether the model's input and output word embeddings should be tied.
56
+ rope_theta (`float`, *optional*, defaults to 5000000.0):
57
+ The base period of the RoPE embeddings.
58
+ rope_scaling (`Dict`, *optional*):
59
+ Dictionary containing the scaling configuration for the RoPE embeddings.
60
+ attention_bias (`bool`, *optional*, defaults to `False`):
61
+ Whether to use a bias in the query, key, value and output projection layers.
62
+ attention_dropout (`float`, *optional*, defaults to 0.0):
63
+ The dropout ratio for the attention probabilities.
64
+ image_token_id (`int`, *optional*):
65
+ Token index used as placeholder for image embeddings.
66
+ video_token_id (`int`, *optional*):
67
+ Token index used as placeholder for video embeddings.
68
+ action_token_id (`int`, *optional*):
69
+ Token index used as placeholder for action embeddings.
70
+ action_start_token_id (`int`, *optional*):
71
+ Token index for action sequence start.
72
+ action_end_token_id (`int`, *optional*):
73
+ Token index for action sequence end.
74
+ vision_start_token_id (`int`, *optional*):
75
+ Token index for vision sequence start.
76
+ **kwargs:
77
+ Additional keyword arguments passed to PretrainedConfig.
78
+ """
79
+
80
+ model_type = "prts_qwen3_vl_text" # TODO (zy): check if this is correct
81
+ base_config_key = "text_config"
82
+
83
+ def __init__(
84
+ self,
85
+ vocab_size=151936,
86
+ hidden_size=4096,
87
+ intermediate_size=22016,
88
+ num_hidden_layers=32,
89
+ num_attention_heads=32,
90
+ num_key_value_heads=32,
91
+ head_dim=128,
92
+ hidden_act="silu",
93
+ max_position_embeddings=128000,
94
+ initializer_range=0.02,
95
+ rms_norm_eps=1e-6,
96
+ use_cache=True,
97
+ tie_word_embeddings=False,
98
+ rope_theta=5000000.0,
99
+ rope_scaling=None,
100
+ attention_bias=False,
101
+ attention_dropout=0.0,
102
+ # PRTS specific
103
+ action_token_id=None,
104
+ action_start_token_id=None,
105
+ action_end_token_id=None,
106
+ crl_goal_repr_token_id=None,
107
+ crl_obs_repr_token_id=None,
108
+ **kwargs,
109
+ ):
110
+ self.vocab_size = vocab_size
111
+ self.max_position_embeddings = max_position_embeddings
112
+ self.hidden_size = hidden_size
113
+ self.intermediate_size = intermediate_size
114
+ self.num_hidden_layers = num_hidden_layers
115
+ self.num_attention_heads = num_attention_heads
116
+
117
+ # for backward compatibility
118
+ if num_key_value_heads is None:
119
+ num_key_value_heads = num_attention_heads
120
+
121
+ self.num_key_value_heads = num_key_value_heads
122
+ self.head_dim = head_dim
123
+ self.hidden_act = hidden_act
124
+ self.initializer_range = initializer_range
125
+ self.rms_norm_eps = rms_norm_eps
126
+ self.use_cache = use_cache
127
+ self.rope_theta = rope_theta
128
+ self.rope_scaling = rope_scaling
129
+ self.attention_bias = attention_bias
130
+ self.attention_dropout = attention_dropout
131
+
132
+ # Validate rope config
133
+ rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
134
+
135
+ # PRTS specific token IDs
136
+ self.action_token_id = action_token_id
137
+ self.action_start_token_id = action_start_token_id
138
+ self.action_end_token_id = action_end_token_id
139
+ self.crl_goal_repr_token_id = crl_goal_repr_token_id
140
+ self.crl_obs_repr_token_id = crl_obs_repr_token_id
141
+
142
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
143
+
144
+
145
+ class PRTS_FlowMatchingConfig_Qwen3VL(PretrainedConfig):
146
+ r"""
147
+ This is the configuration class to store the configuration of a PRTS model based on Qwen3-VL.
148
+ It extends PretrainedConfig with Qwen3-VL model parameters and PRTS-specific parameters for action prediction.
149
+
150
+ [`PRTS_FlowMatchingConfig_Qwen3VL`] is the configuration class to store the configuration of a PRTS model. It is used to
151
+ instantiate a PRTS model according to the specified arguments, defining the vision encoder, text encoder,
152
+ action expert, and flow matching components.
153
+
154
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
155
+
156
+ Args:
157
+ text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `PRTS_Qwen3VLTextConfig`):
158
+ The config object or dictionary of the text backbone.
159
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
160
+ The config object or dictionary of the vision backbone.
161
+ max_action_dim (`int`, *optional*, defaults to 32):
162
+ Maximum dimension of action vectors. Used for padding different robot action spaces.
163
+ action_chunk_size (`int`, *optional*, defaults to 50):
164
+ Number of action timesteps to predict in each forward pass.
165
+ num_denoise_steps (`int`, *optional*, defaults to 4):
166
+ Number of denoising steps for flow matching during inference.
167
+ flow_matching_action_loss_weight (`float`, *optional*, defaults to 0.0):
168
+ Weight for the flow matching action loss.
169
+ crl_loss_weight (`float`, *optional*, defaults to 0.0):
170
+ Weight for the Contrastive Reinforcement Learning (CRL) loss. Set to 0 to disable.
171
+ crl_embed_dim (`int`, *optional*, defaults to 256):
172
+ Dimension of the CRL embedding space for action and goal encoders.
173
+ crl_logsumexp_reg_weight (`float`, *optional*, defaults to 0.0):
174
+ Weight for logsumexp regularization on CRL logits.
175
+ image_token_id (`int`, *optional*):
176
+ Token id for image placeholders.
177
+ video_token_id (`int`, *optional*):
178
+ Token id for video placeholders.
179
+ vision_start_token_id (`int`, *optional*):
180
+ Token id for vision start marker.
181
+ vision_end_token_id (`int`, *optional*):
182
+ Token id for vision end marker.
183
+ **kwargs:
184
+ Additional keyword arguments passed to PretrainedConfig.
185
+
186
+ Example:
187
+
188
+ ```python
189
+ >>> from prts.models import PRTS_FlowMatchingConfig_Qwen3VL, PRTS_Qwen3VL
190
+
191
+ >>> # Initializing a PRTS Qwen3-VL configuration
192
+ >>> configuration = PRTS_FlowMatchingConfig_Qwen3VL()
193
+
194
+ >>> # Initializing a model from the configuration
195
+ >>> model = PRTS_Qwen3VL(configuration)
196
+
197
+ >>> # Accessing the model configuration
198
+ >>> configuration = model.config
199
+ ```
200
+ """
201
+
202
+ model_type = "prts_qwen3_vl"
203
+ sub_configs = {
204
+ "vision_config": Qwen3VLVisionConfig,
205
+ "text_config": PRTS_Qwen3VLTextConfig,
206
+ }
207
+ keys_to_ignore_at_inference = ["past_key_values"]
208
+
209
+ def __init__(
210
+ self,
211
+ text_config=None,
212
+ vision_config=None,
213
+ image_token_id=151655,
214
+ video_token_id=151656,
215
+ vision_start_token_id=151652,
216
+ vision_end_token_id=151653,
217
+ tie_word_embeddings=False,
218
+ # PRTS specific
219
+ max_action_dim=32,
220
+ action_chunk_size=50,
221
+ num_denoise_steps=4,
222
+ flow_matching_action_loss_weight=0.,
223
+ use_fast_action_tokenizer=True,
224
+ # Embodiment tag: identifies the robot embodiment used for finetuning.
225
+ # Stores the delta_action_mask key so eval code can recover it without
226
+ # needing the training dataset config.
227
+ embodiment_tag=None,
228
+ # DiT action head config
229
+ dit_action_head_config=None,
230
+ # CRL (Contrastive Reinforcement Learning) parameters
231
+ crl_loss_weight=0.,
232
+ crl_embed_dim=256,
233
+ crl_logsumexp_reg_weight=0.0,
234
+ crl_encoder_init_w=1e-12, # Cold initialization weight for encoder last layer
235
+ crl_repr_norm=True, # Whether to L2-normalize CRL representations
236
+ **kwargs,
237
+ ):
238
+ # Initialize vision config
239
+ if isinstance(vision_config, dict):
240
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
241
+ elif vision_config is None:
242
+ self.vision_config = self.sub_configs["vision_config"]()
243
+
244
+ # Initialize text config
245
+ if isinstance(text_config, dict):
246
+ self.text_config = self.sub_configs["text_config"](**text_config)
247
+ elif text_config is None:
248
+ # For BC use all kwargs to init `TextConfig`
249
+ self.text_config = self.sub_configs["text_config"](**kwargs)
250
+
251
+ # PRTS-specific parameters
252
+ self.max_action_dim = max_action_dim
253
+ self.action_chunk_size = action_chunk_size
254
+ self.num_denoise_steps = num_denoise_steps
255
+ self.flow_matching_action_loss_weight = flow_matching_action_loss_weight
256
+ self.use_fast_action_tokenizer = use_fast_action_tokenizer
257
+ self.embodiment_tag = embodiment_tag
258
+
259
+ # DiT action head config (nested dict)
260
+ # cross_attention_dim defaults to text_config.hidden_size at model init time
261
+ _default_dit_config = {
262
+ # Architecture — aligned with GR00T N1.6 (32 layers, inner_dim=32×48=1536)
263
+ "num_layers": 16, # 32
264
+ "num_attention_heads": 32,
265
+ "attention_head_dim": 48,
266
+ "output_dim": 1024,
267
+ # Regularisation
268
+ "dropout": 0.2,
269
+ "interleave_self_attention": True,
270
+ "norm_type": "ada_norm",
271
+ "final_dropout": True,
272
+ # Action-head specifics
273
+ "add_pos_embed": True,
274
+ # Noise schedule
275
+ "noise_beta_alpha": 1.5,
276
+ "noise_beta_beta": 1.0,
277
+ "noise_s": 0.999,
278
+ "num_timestep_buckets": 1000,
279
+ # Attention backend
280
+ "attn_implementation": "sdpa",
281
+ # AlternateVLDiT — separate visual / text token cross-attention
282
+ "use_alternate_vl_dit": True,
283
+ "attend_text_every_n_blocks": 2,
284
+ # MoT-style action expert: forwards full VLM ``past_key_values`` into the head;
285
+ # expert depth defaults to text_config.num_hidden_layers (override with expert_num_layers).
286
+ "use_mot_action_expert": False,
287
+ "mlp_mult": 4, # FFN hidden dim = inner_dim * mlp_mult (standard DiT only)
288
+ }
289
+ if dit_action_head_config is not None:
290
+ _default_dit_config.update(dit_action_head_config)
291
+ self.dit_action_head_config = _default_dit_config
292
+
293
+ # CRL (Contrastive Reinforcement Learning) parameters
294
+ self.crl_loss_weight = crl_loss_weight
295
+ self.crl_embed_dim = crl_embed_dim
296
+ self.crl_logsumexp_reg_weight = crl_logsumexp_reg_weight
297
+ self.crl_encoder_init_w = crl_encoder_init_w
298
+ self.crl_repr_norm = crl_repr_norm
299
+
300
+ # Token IDs
301
+ self.image_token_id = image_token_id
302
+ self.video_token_id = video_token_id
303
+ self.vision_start_token_id = vision_start_token_id
304
+ self.vision_end_token_id = vision_end_token_id
305
+
306
+ # # Propagate token IDs to text config
307
+ # if self.image_token_id is not None:
308
+ # self.text_config.image_token_id = self.image_token_id
309
+ # if self.video_token_id is not None:
310
+ # self.text_config.video_token_id = self.video_token_id
311
+ # if self.vision_start_token_id is not None:
312
+ # self.text_config.vision_start_token_id = self.vision_start_token_id
313
+
314
+ # Ensure vocab sizes are consistent
315
+ # if hasattr(self.text_config, 'vocab_size'):
316
+ # self.vocab_size = self.text_config.vocab_size
317
+
318
+ super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
319
+
320
+ # TODO (zy): revisit whether passing these state/action special tokens in via VLConfig would be more appropriate and flexible
321
+ @property
322
+ def action_token_id(self):
323
+ """Get action token id from text config."""
324
+ return getattr(self.text_config, 'action_token_id', None)
325
+
326
+ @action_token_id.setter
327
+ def action_token_id(self, value):
328
+ """Set action token id in text config."""
329
+ if hasattr(self.text_config, 'action_token_id'):
330
+ self.text_config.action_token_id = value
331
+
332
+ def __getattribute__(self, key):
333
+ if "text_config" in super().__getattribute__("__dict__") and key not in [
334
+ "dtype",
335
+ "_attn_implementation_internal",
336
+ ]:
337
+ text_config = super().__getattribute__("text_config")
338
+ if key in text_config.__dict__:
339
+ return getattr(text_config, key)
340
+
341
+ return super().__getattribute__(key)
342
+
343
+
344
+ PRTS_FlowMatchingConfig_Qwen3VL.register_for_auto_class()
345
+ __all__ = ["PRTS_FlowMatchingConfig_Qwen3VL", "PRTS_Qwen3VLTextConfig"]
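A small sketch of the attribute fall-through implemented by `__getattribute__` above: fields that live on `text_config` can be read directly from the top-level config object, while PRTS-specific fields stay on the outer config. This assumes the file above is on the import path (e.g. run from the checkpoint directory).

```python
from configuration_prts_qwen3_vl import PRTS_FlowMatchingConfig_Qwen3VL

cfg = PRTS_FlowMatchingConfig_Qwen3VL(text_config={"hidden_size": 2560, "vocab_size": 153722})

print(cfg.text_config.hidden_size)  # 2560
print(cfg.hidden_size)              # 2560 as well, delegated to text_config
print(cfg.action_chunk_size)        # 50, PRTS-specific default kept on the outer config
```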
dit_action_head.py ADDED
@@ -0,0 +1,1230 @@
1
+ """
2
+ DiT (Diffusion Transformer) based flow matching action head for PRTS.
3
+
4
+ Replaces the Qwen3VLTextModel-based fm_action_expert with a lightweight DiT
5
+ that uses explicit cross-attention to VLM hidden states, following the architecture
6
+ from GR00T / pi05.
7
+
8
+ Architecture:
9
+ ActionEncoder(noisy_actions + dof_mask, timestep)
10
+ → action_features
11
+ → DiT(cross-attn to VLM hidden states, ada-norm timestep conditioning)
12
+ → ActionDecoder → predicted velocity
13
+ """
14
+
15
+ import math
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from torch.distributions import Beta
21
+ from typing import Optional
22
+
23
+ from transformers.cache_utils import Cache
24
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
25
+
26
+
27
+ # DIT_PRESETS = {
28
+ # "DiT-B": {"num_attention_heads": 12, "attention_head_dim": 64, "output_dim": 768},
29
+ # "DiT-L": {"num_attention_heads": 32, "attention_head_dim": 48, "output_dim": 1536},
30
+ # }
31
+
32
+
33
+ class SinusoidalPositionalEncoding(nn.Module):
34
+ """Sinusoidal positional encoding for sequence positions or timesteps."""
35
+
36
+ def __init__(self, embedding_dim: int):
37
+ super().__init__()
38
+ self.embedding_dim = embedding_dim
39
+
40
+ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
41
+ timesteps = timesteps.float()
42
+ squeeze = False
43
+ if timesteps.dim() == 1:
44
+ timesteps = timesteps.unsqueeze(1)
45
+ squeeze = True
46
+
47
+ half_dim = self.embedding_dim // 2
48
+ exponent = -torch.arange(half_dim, dtype=torch.float, device=timesteps.device) * (
49
+ math.log(10000.0) / half_dim
50
+ )
51
+ freqs = timesteps.unsqueeze(-1) * exponent.exp()
52
+ enc = torch.cat([torch.sin(freqs), torch.cos(freqs)], dim=-1)
53
+
54
+ if squeeze:
55
+ enc = enc.squeeze(1)
56
+ return enc
57
+
58
+
59
+ class TimestepEncoder(nn.Module):
60
+ """Projects scalar timesteps to embedding space via sinusoidal encoding + MLP."""
61
+
62
+ def __init__(self, embedding_dim: int):
63
+ super().__init__()
64
+ self.sinusoidal = SinusoidalPositionalEncoding(256)
65
+ self.linear_1 = nn.Linear(256, embedding_dim)
66
+ self.act = nn.SiLU()
67
+ self.linear_2 = nn.Linear(embedding_dim, embedding_dim)
68
+
69
+ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
70
+ t_emb = self.sinusoidal(timesteps)
71
+ t_emb = self.linear_1(t_emb.to(dtype=self.linear_1.weight.dtype))
72
+ t_emb = self.act(t_emb)
73
+ t_emb = self.linear_2(t_emb)
74
+ return t_emb
75
+
76
+
77
+ class AdaLayerNorm(nn.Module):
78
+ """Adaptive Layer Normalization conditioned on timestep embeddings.
79
+
80
+ Applies scale-shift modulation: out = norm(x) * (1 + scale) + shift,
81
+ where (scale, shift) are linearly projected from the timestep embedding.
82
+ """
83
+
84
+ def __init__(self, embedding_dim: int, eps: float = 1e-5):
85
+ super().__init__()
86
+ self.silu = nn.SiLU()
87
+ self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
88
+ self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=False)
89
+
90
+ def forward(self, x: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
91
+ temb = self.linear(self.silu(temb))
92
+ scale, shift = temb.chunk(2, dim=-1)
93
+ x = self.norm(x) * (1 + scale[:, None]) + shift[:, None]
94
+ return x
95
+
96
+
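The scale-shift modulation above is easy to check in isolation. The following is a minimal sketch with hypothetical dimensions, restating the same computation outside the class.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Standalone restatement of the adaLN scale-shift (hypothetical dims, illustration only).
dim, B, T = 8, 2, 5
norm   = nn.LayerNorm(dim, elementwise_affine=False)
to_mod = nn.Linear(dim, 2 * dim)

x    = torch.randn(B, T, dim)    # token features
temb = torch.randn(B, dim)       # one timestep embedding per sample

scale, shift = to_mod(F.silu(temb)).chunk(2, dim=-1)
out = norm(x) * (1 + scale[:, None]) + shift[:, None]   # broadcast over the T axis
assert out.shape == x.shape
```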
97
+ class DiTAttention(nn.Module):
98
+ """Multi-head attention supporting both self-attention and cross-attention.
99
+
100
+ Supports two backends selected via ``attn_implementation``:
101
+
102
+ * ``"sdpa"`` (default) – uses :func:`F.scaled_dot_product_attention`, which
103
+ dispatches automatically to FlashAttention / memory-efficient attention
104
+ depending on the installed PyTorch build. The encoder padding mask is
105
+ expanded to ``(B, 1, 1, S)`` and passed as ``attn_mask``.
106
+
107
+ * ``"flash_attention_2"`` – uses the ``flash_attn`` kernels via HuggingFace's ``_flash_attention_forward`` helper, for
108
+ lower memory usage and higher throughput. For cross-attention with an
109
+ encoder padding mask the k/v tensors are unpadded and
110
+ :func:`flash_attn_varlen_func` is used so that padding tokens are never
111
+ processed. For self-attention (no mask) the simpler
112
+ :func:`flash_attn_func` is used.
113
+ """
114
+
115
+ def __init__(
116
+ self,
117
+ query_dim: int,
118
+ num_heads: int,
119
+ head_dim: int,
120
+ cross_attention_dim: Optional[int] = None,
121
+ dropout: float = 0.0,
122
+ bias: bool = True,
123
+ attn_implementation: str = "sdpa",
124
+ ):
125
+ super().__init__()
126
+ self.num_heads = num_heads
127
+ self.head_dim = head_dim
128
+ self.attn_implementation = attn_implementation
129
+ inner_dim = num_heads * head_dim
130
+
131
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
132
+ kv_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
133
+ self.to_k = nn.Linear(kv_dim, inner_dim, bias=bias)
134
+ self.to_v = nn.Linear(kv_dim, inner_dim, bias=bias)
135
+ self.to_out = nn.Sequential(
136
+ nn.Linear(inner_dim, query_dim, bias=bias),
137
+ nn.Dropout(dropout),
138
+ )
139
+
140
+ # ------------------------------------------------------------------
141
+ # Flash-Attention backend
142
+ # ------------------------------------------------------------------
143
+
144
+ def _flash_attn_forward(
145
+ self,
146
+ q: torch.Tensor,
147
+ k: torch.Tensor,
148
+ v: torch.Tensor,
149
+ attention_mask: Optional[torch.Tensor],
150
+ ) -> torch.Tensor:
151
+ """Run Flash Attention via HuggingFace's ``_flash_attention_forward``.
152
+
153
+ Args:
154
+ q: ``(B, T_q, H, D)``
155
+ k: ``(B, T_k, H, D)``
156
+ v: ``(B, T_k, H, D)``
157
+ attention_mask: ``(B, T_k)`` bool, True = valid token.
158
+
159
+ Returns:
160
+ ``(B, T_q, H*D)``
161
+ """
162
+
163
+ B, T_q, H, D = q.shape
164
+ # _flash_attention_forward returns (B, T_q, H, D); handles unpad/varlen internally.
165
+ out = _flash_attention_forward(
166
+ q, k, v,
167
+ attention_mask=attention_mask,
168
+ query_length=T_q,
169
+ is_causal=False,
170
+ dropout=0.0,
171
+ )
172
+ return out.reshape(B, T_q, H * D)
173
+
174
+ # ------------------------------------------------------------------
175
+ # Forward
176
+ # ------------------------------------------------------------------
177
+
178
+ def forward(
179
+ self,
180
+ hidden_states: torch.Tensor,
181
+ encoder_hidden_states: Optional[torch.Tensor] = None,
182
+ attention_mask: Optional[torch.Tensor] = None,
183
+ ) -> torch.Tensor:
184
+ B, T, _ = hidden_states.shape
185
+
186
+ q = self.to_q(hidden_states)
187
+ kv_input = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
188
+ k = self.to_k(kv_input)
189
+ v = self.to_v(kv_input)
190
+
191
+ if self.attn_implementation == "flash_attention_2":
192
+ # Flash Attention expects (B, S, H, D)
193
+ q = q.view(B, T, self.num_heads, self.head_dim)
194
+ k = k.view(B, -1, self.num_heads, self.head_dim)
195
+ v = v.view(B, -1, self.num_heads, self.head_dim)
196
+ attn_output = self._flash_attn_forward(q, k, v, attention_mask)
197
+ else:
198
+ # SDPA expects (B, H, S, D)
199
+ q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
200
+ k = k.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
201
+ v = v.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
202
+
203
+ # Expand (B, S) bool mask → (B, 1, 1, S) for broadcasting.
204
+ sdpa_mask = None
205
+ if attention_mask is not None:
206
+ if attention_mask.dim() == 2:
207
+ sdpa_mask = attention_mask[:, None, None, :]
208
+ else:
209
+ sdpa_mask = attention_mask
210
+
211
+ attn_output = F.scaled_dot_product_attention(
212
+ q, k, v, attn_mask=sdpa_mask, dropout_p=0.0
213
+ )
214
+ attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, -1)
215
+
216
+ return self.to_out(attn_output)
217
+
218
+
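For the SDPA path, the (B, S) True-means-valid padding mask is broadcast to (B, 1, 1, S) so that queries simply cannot attend to padded encoder tokens. A minimal, self-contained illustration with hypothetical sizes:

```python
import torch
import torch.nn.functional as F

# Illustrative only: how a (B, S) "True = valid" padding mask feeds SDPA cross-attention.
B, H, T_q, S, D = 2, 4, 6, 10, 16
q = torch.randn(B, H, T_q, D)
k = torch.randn(B, H, S, D)
v = torch.randn(B, H, S, D)

valid = torch.ones(B, S, dtype=torch.bool)
valid[:, -3:] = False                       # pretend the last 3 encoder tokens are padding

attn_mask = valid[:, None, None, :]         # (B, 1, 1, S); True = may attend, False = masked
out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
assert out.shape == (B, H, T_q, D)
```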
219
+ class FeedForward(nn.Module):
220
+ """Feed-forward network with GELU activation."""
221
+
222
+ def __init__(self, dim: int, dropout: float = 0.0, mult: int = 4):
223
+ super().__init__()
224
+ inner_dim = dim * mult
225
+ self.net = nn.Sequential(
226
+ nn.Linear(dim, inner_dim),
227
+ nn.GELU(approximate="tanh"),
228
+ nn.Dropout(dropout),
229
+ nn.Linear(inner_dim, dim),
230
+ nn.Dropout(dropout),
231
+ )
232
+
233
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
234
+ return self.net(x)
235
+
236
+
237
+ class BasicTransformerBlock(nn.Module):
238
+ """Transformer block with self/cross-attention, optional AdaLayerNorm, and feed-forward.
239
+
240
+ When cross_attention_dim is set, the attention block performs cross-attention
241
+ to encoder_hidden_states. Otherwise, it performs self-attention.
242
+ """
243
+
244
+ def __init__(
245
+ self,
246
+ dim: int,
247
+ num_attention_heads: int,
248
+ attention_head_dim: int,
249
+ dropout: float = 0.0,
250
+ cross_attention_dim: Optional[int] = None,
251
+ norm_type: str = "ada_norm",
252
+ final_dropout: bool = False,
253
+ attn_implementation: str = "sdpa",
254
+ ):
255
+ super().__init__()
256
+ self.norm_type = norm_type
257
+
258
+ if norm_type == "ada_norm":
259
+ self.norm1 = AdaLayerNorm(dim)
260
+ else:
261
+ self.norm1 = nn.LayerNorm(dim)
262
+
263
+ self.attn1 = DiTAttention(
264
+ query_dim=dim,
265
+ num_heads=num_attention_heads,
266
+ head_dim=attention_head_dim,
267
+ cross_attention_dim=cross_attention_dim,
268
+ dropout=dropout,
269
+ attn_implementation=attn_implementation,
270
+ )
271
+
272
+ self.norm3 = nn.LayerNorm(dim)
273
+ self.ff = FeedForward(dim, dropout=dropout)
274
+ self.final_dropout = nn.Dropout(dropout) if final_dropout else None
275
+
276
+ def forward(
277
+ self,
278
+ hidden_states: torch.Tensor,
279
+ encoder_hidden_states: Optional[torch.Tensor] = None,
280
+ encoder_attention_mask: Optional[torch.Tensor] = None,
281
+ temb: Optional[torch.Tensor] = None,
282
+ ) -> torch.Tensor:
283
+ if self.norm_type == "ada_norm":
284
+ norm_hidden_states = self.norm1(hidden_states, temb)
285
+ else:
286
+ norm_hidden_states = self.norm1(hidden_states)
287
+
288
+ attn_output = self.attn1(
289
+ norm_hidden_states,
290
+ encoder_hidden_states=encoder_hidden_states,
291
+ attention_mask=encoder_attention_mask,
292
+ )
293
+
294
+ if self.final_dropout is not None:
295
+ attn_output = self.final_dropout(attn_output)
296
+
297
+ hidden_states = attn_output + hidden_states
298
+
299
+ norm_hidden_states = self.norm3(hidden_states)
300
+ ff_output = self.ff(norm_hidden_states)
301
+ hidden_states = ff_output + hidden_states
302
+
303
+ return hidden_states
304
+
305
+
306
+ class DiT(nn.Module):
307
+ """Diffusion Transformer with cross-attention to VLM context features.
308
+
309
+ Interleaves cross-attention blocks (attending to encoder_hidden_states)
310
+ with self-attention blocks when interleave_self_attention=True.
311
+ Uses AdaLayerNorm for timestep conditioning throughout.
312
+
313
+ Output block applies timestep-conditioned scale-shift before final projection.
314
+ """
315
+
316
+ def __init__(
317
+ self,
318
+ num_attention_heads: int = 12,
319
+ attention_head_dim: int = 64,
320
+ output_dim: int = 768,
321
+ num_layers: int = 12,
322
+ dropout: float = 0.1,
323
+ norm_type: str = "ada_norm",
324
+ final_dropout: bool = True,
325
+ interleave_self_attention: bool = False,
326
+ cross_attention_dim: Optional[int] = None,
327
+ attn_implementation: str = "sdpa",
328
+ ):
329
+ super().__init__()
330
+ self.inner_dim = num_attention_heads * attention_head_dim
331
+ self.output_dim = output_dim
332
+ self.num_layers = num_layers
333
+ self.interleave_self_attention = interleave_self_attention
334
+
335
+ self.timestep_encoder = TimestepEncoder(self.inner_dim)
336
+
337
+ all_blocks = []
338
+ for idx in range(num_layers):
339
+ use_self_attn = idx % 2 == 1 and interleave_self_attention
340
+ curr_cross_attention_dim = cross_attention_dim if not use_self_attn else None
341
+
342
+ all_blocks.append(
343
+ BasicTransformerBlock(
344
+ dim=self.inner_dim,
345
+ num_attention_heads=num_attention_heads,
346
+ attention_head_dim=attention_head_dim,
347
+ dropout=dropout,
348
+ cross_attention_dim=curr_cross_attention_dim,
349
+ norm_type=norm_type,
350
+ final_dropout=final_dropout,
351
+ attn_implementation=attn_implementation,
352
+ )
353
+ )
354
+ self.transformer_blocks = nn.ModuleList(all_blocks)
355
+
356
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
357
+ self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
358
+ self.proj_out_2 = nn.Linear(self.inner_dim, output_dim)
359
+
360
+ def forward(
361
+ self,
362
+ hidden_states: torch.Tensor,
363
+ encoder_hidden_states: torch.Tensor,
364
+ timestep: torch.LongTensor,
365
+ encoder_attention_mask: Optional[torch.Tensor] = None,
366
+ ) -> torch.Tensor:
367
+ temb = self.timestep_encoder(timestep)
368
+
369
+ hidden_states = hidden_states.contiguous()
370
+ encoder_hidden_states = encoder_hidden_states.contiguous()
371
+
372
+ for idx, block in enumerate(self.transformer_blocks):
373
+ if idx % 2 == 1 and self.interleave_self_attention:
374
+ hidden_states = block(
375
+ hidden_states,
376
+ encoder_hidden_states=None,
377
+ encoder_attention_mask=None,
378
+ temb=temb,
379
+ )
380
+ else:
381
+ hidden_states = block(
382
+ hidden_states,
383
+ encoder_hidden_states=encoder_hidden_states,
384
+ encoder_attention_mask=encoder_attention_mask,
385
+ temb=temb,
386
+ )
387
+
388
+ conditioning = temb
389
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=-1)
390
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
391
+ return self.proj_out_2(hidden_states)
392
+
393
+
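A quick smoke-test sketch of the DiT defined above, with small hypothetical sizes rather than the shipped configuration, may help make the expected tensor shapes concrete:

```python
import torch

# Smoke-test sketch for the DiT above (small hypothetical sizes, not the shipped config).
dit = DiT(
    num_attention_heads=4,
    attention_head_dim=32,        # inner_dim = 4 * 32 = 128
    output_dim=64,
    num_layers=4,
    cross_attention_dim=256,      # width of the VLM hidden states
    interleave_self_attention=True,
    attn_implementation="sdpa",
)

B, T, S = 2, 16, 40
action_tokens = torch.randn(B, T, 128)            # must match inner_dim
vlm_hidden    = torch.randn(B, S, 256)            # cross-attention context
timesteps     = torch.randint(0, 1000, (B,))      # discretized flow-matching time
valid         = torch.ones(B, S, dtype=torch.bool)

out = dit(action_tokens, vlm_hidden, timesteps, encoder_attention_mask=valid)
assert out.shape == (B, T, 64)
```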
394
+ class AlternateVLDiT(DiT):
395
+ """DiT variant that separates visual and text tokens during cross-attention.
396
+
397
+ Mirrors GR00T's AlternateVLDiT: even-indexed blocks do cross-attention,
398
+ alternating every ``attend_text_every_n_blocks`` between text tokens and
399
+ visual tokens. Odd-indexed blocks do self-attention (requires
400
+ ``interleave_self_attention=True``).
401
+
402
+ When no visual tokens are present (``image_mask`` is None or all-False),
403
+ all valid tokens are treated as text.
404
+ """
405
+
406
+ def __init__(self, *args, attend_text_every_n_blocks: int = 2, **kwargs):
407
+ super().__init__(*args, **kwargs)
408
+ assert self.interleave_self_attention, (
409
+ "AlternateVLDiT requires interleave_self_attention=True"
410
+ )
411
+ self.attend_text_every_n_blocks = attend_text_every_n_blocks
412
+
413
+ def forward(
414
+ self,
415
+ hidden_states: torch.Tensor,
416
+ encoder_hidden_states: torch.Tensor,
417
+ timestep: torch.LongTensor,
418
+ encoder_attention_mask: Optional[torch.Tensor] = None,
419
+ image_mask: Optional[torch.Tensor] = None,
420
+ ) -> torch.Tensor:
421
+ """
422
+ Args:
423
+ encoder_attention_mask: (B, S) bool – True = valid VLM token.
424
+ image_mask: (B, S) bool – True = visual token position.
425
+ If None, all valid tokens are treated as text.
426
+ """
427
+ temb = self.timestep_encoder(timestep)
428
+ hidden_states = hidden_states.contiguous()
429
+ encoder_hidden_states = encoder_hidden_states.contiguous()
430
+
431
+ B, S, _ = encoder_hidden_states.shape
432
+ backbone_mask = (
433
+ encoder_attention_mask.bool()
434
+ if encoder_attention_mask is not None
435
+ else torch.ones(B, S, dtype=torch.bool, device=hidden_states.device)
436
+ )
437
+
438
+ if image_mask is not None and image_mask.any():
439
+ vis_mask = image_mask.bool() & backbone_mask # visual tokens
440
+ text_mask = (~image_mask.bool()) & backbone_mask # text tokens
441
+ else:
442
+ # No visual tokens – treat everything as text.
443
+ vis_mask = torch.zeros_like(backbone_mask)
444
+ text_mask = backbone_mask
445
+
446
+ for idx, block in enumerate(self.transformer_blocks):
447
+ if idx % 2 == 1:
448
+ # Self-attention block.
449
+ hidden_states = block(
450
+ hidden_states,
451
+ encoder_hidden_states=None,
452
+ encoder_attention_mask=None,
453
+ temb=temb,
454
+ )
455
+ else:
456
+ # Cross-attention block: alternate text / visual every N blocks.
457
+ if idx % (2 * self.attend_text_every_n_blocks) == 0:
458
+ curr_mask = text_mask
459
+ else:
460
+ curr_mask = vis_mask
461
+ hidden_states = block(
462
+ hidden_states,
463
+ encoder_hidden_states=encoder_hidden_states,
464
+ encoder_attention_mask=curr_mask,
465
+ temb=temb,
466
+ )
467
+
468
+ conditioning = temb
469
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=-1)
470
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
471
+ return self.proj_out_2(hidden_states)
472
+
473
+
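With the default ``attend_text_every_n_blocks=2``, the schedule implied by the forward pass above is: odd blocks self-attend, block indices divisible by 4 cross-attend to text tokens, and the remaining even blocks cross-attend to visual tokens. A tiny illustrative loop (hypothetical depth):

```python
# Illustration of the attention schedule implied by AlternateVLDiT.forward
# (hypothetical depth; even blocks cross-attend, odd blocks self-attend).
num_layers = 8
attend_text_every_n_blocks = 2

for idx in range(num_layers):
    if idx % 2 == 1:
        kind = "self-attention over action tokens"
    elif idx % (2 * attend_text_every_n_blocks) == 0:
        kind = "cross-attention to TEXT tokens"
    else:
        kind = "cross-attention to VISUAL tokens"
    print(f"block {idx}: {kind}")
# -> blocks 0 and 4 attend to text, blocks 2 and 6 to visuals, odd blocks self-attend.
```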
474
+ class ActionEncoder(nn.Module):
475
+ """Encodes noisy actions (optionally concatenated with DOF mask) and timestep
476
+ into hidden features via MLP + sinusoidal time encoding.
477
+
478
+ Architecture: Linear → concat(action_emb, time_emb) → SiLU + Linear → Linear
479
+ """
480
+
481
+ def __init__(self, action_input_dim: int, hidden_size: int):
482
+ super().__init__()
483
+ self.hidden_size = hidden_size
484
+ self.layer1 = nn.Linear(action_input_dim, hidden_size)
485
+ self.layer2 = nn.Linear(2 * hidden_size, hidden_size)
486
+ self.layer3 = nn.Linear(hidden_size, hidden_size)
487
+ self.pos_encoding = SinusoidalPositionalEncoding(hidden_size)
488
+
489
+ def forward(self, actions: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
490
+ """
491
+ Args:
492
+ actions: (B, T, action_input_dim) noisy actions (+ DOF mask)
493
+ timesteps: (B,) discretized timesteps
494
+ """
495
+ B, T, _ = actions.shape
496
+ timesteps_expanded = timesteps.unsqueeze(1).expand(-1, T)
497
+
498
+ a_emb = self.layer1(actions)
499
+ tau_emb = self.pos_encoding(timesteps_expanded).to(dtype=a_emb.dtype)
500
+
501
+ x = torch.cat([a_emb, tau_emb], dim=-1)
502
+ x = F.silu(self.layer2(x))
503
+ x = self.layer3(x)
504
+ return x
505
+
506
+
507
+ class ActionDecoder(nn.Module):
508
+ """2-layer MLP that decodes DiT output to action-space velocity."""
509
+
510
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
511
+ super().__init__()
512
+ self.layer1 = nn.Linear(input_dim, hidden_dim)
513
+ self.layer2 = nn.Linear(hidden_dim, output_dim)
514
+
515
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
516
+ return self.layer2(F.relu(self.layer1(x)))
517
+
518
+
519
+ class FlowMatchingDiTHead(nn.Module):
520
+ """Flow matching action head using DiT (Diffusion Transformer).
521
+
522
+ Replaces the fm_action_expert (Qwen3VLTextModel-based) with a DiT that uses
523
+ explicit cross-attention to VLM hidden states instead of KV cache continuation.
524
+
525
+ Training:
526
+ 1. Sample noise and timestep from Beta distribution
527
+ 2. Compute noisy trajectory: x_t = (1-t)*noise + t*actions
528
+ 3. Compute velocity target: v = actions - noise
529
+ 4. Encode noisy actions + DOF mask + timestep → action features
530
+ 5. Run DiT with cross-attention to VLM hidden states
531
+ 6. Decode to action-space velocity prediction
533
+
534
+ Inference:
535
+ Euler integration from pure noise (t=0) to clean actions (t=1)
536
+ over num_inference_timesteps steps.
537
+ """
538
+
539
+ def __init__(
540
+ self,
541
+ action_dim: int,
542
+ action_chunk_size: int,
543
+ cross_attention_dim: int,
544
+ num_inference_timesteps: int = 4,
545
+ config: Optional[dict] = None,
546
+ ):
547
+ super().__init__()
548
+ cfg = {
549
+ "num_layers": 16,
550
+ "num_attention_heads": 12,
551
+ "attention_head_dim": 64,
552
+ "output_dim": 1024,
553
+ "dropout": 0.2,
554
+ "interleave_self_attention": True,
555
+ "norm_type": "ada_norm",
556
+ "final_dropout": True,
557
+ "add_pos_embed": True,
558
+ "noise_beta_alpha": 1.5,
559
+ "noise_beta_beta": 1.0,
560
+ "noise_s": 0.999,
561
+ "num_timestep_buckets": 1000,
562
+ "attn_implementation": "sdpa",
563
+ "use_alternate_vl_dit": False,
564
+ "attend_text_every_n_blocks": 2,
565
+ }
566
+ if config is not None:
567
+ cfg.update(config)
568
+ # dit_model_type = config.get("dit_model_type")
569
+ # if dit_model_type and dit_model_type in DIT_PRESETS:
570
+ # cfg.update(DIT_PRESETS[dit_model_type])
571
+ # cfg.pop("dit_model_type", None)
572
+
573
+ self.action_dim = action_dim
574
+ self.action_chunk_size = action_chunk_size
575
+ self.num_inference_timesteps = num_inference_timesteps
576
+ self.num_timestep_buckets = cfg["num_timestep_buckets"]
577
+ self.noise_s = cfg["noise_s"]
578
+ self.use_alternate_vl_dit = cfg["use_alternate_vl_dit"]
579
+ self.add_pos_embed = cfg["add_pos_embed"]
580
+
581
+ num_attention_heads = cfg["num_attention_heads"]
582
+ attention_head_dim = cfg["attention_head_dim"]
583
+ output_dim = cfg["output_dim"]
584
+ inner_dim = num_attention_heads * attention_head_dim
585
+
586
+ dit_kwargs = dict(
587
+ num_attention_heads=num_attention_heads,
588
+ attention_head_dim=attention_head_dim,
589
+ output_dim=output_dim,
590
+ num_layers=cfg["num_layers"],
591
+ dropout=cfg["dropout"],
592
+ norm_type=cfg["norm_type"],
593
+ final_dropout=cfg["final_dropout"],
594
+ interleave_self_attention=cfg["interleave_self_attention"],
595
+ cross_attention_dim=cross_attention_dim,
596
+ attn_implementation=cfg["attn_implementation"],
597
+ )
598
+ if self.use_alternate_vl_dit:
599
+ self.dit = AlternateVLDiT(
600
+ **dit_kwargs,
601
+ attend_text_every_n_blocks=cfg["attend_text_every_n_blocks"],
602
+ )
603
+ else:
604
+ self.dit = DiT(**dit_kwargs)
605
+
606
+ # action_dim * 2: noisy action + DOF mask concatenated
607
+ self.action_encoder = ActionEncoder(action_dim * 2, inner_dim)
608
+ self.action_decoder = ActionDecoder(output_dim, inner_dim, action_dim)
609
+
610
+ if self.add_pos_embed:
611
+ max_seq_len = max(action_chunk_size, 256)
612
+ self.position_embedding = nn.Embedding(max_seq_len, inner_dim)
613
+ nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
614
+
615
+ # self.beta_dist = Beta(cfg["noise_beta_alpha"], cfg["noise_beta_beta"])
616
+ self._beta_alpha = cfg["noise_beta_alpha"]
617
+ self._beta_beta = cfg["noise_beta_beta"]
618
+
619
+ def reset_parameters(self):
620
+ """Re-apply proper initialization.
621
+
622
+ HuggingFace from_pretrained calls _init_weights on modules whose
623
+ parameters are absent from the checkpoint, overwriting any custom
624
+ init done in __init__. Call this after from_pretrained when loading
625
+ from a base VLM checkpoint that does not contain DiT weights.
626
+ """
627
+ if self.add_pos_embed:
628
+ nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
629
+ for module in self.modules():
630
+ if isinstance(module, nn.Linear):
631
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
632
+ if module.bias is not None:
633
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
634
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
635
+ nn.init.uniform_(module.bias, -bound, bound)
636
+ elif isinstance(module, nn.LayerNorm):
637
+ if module.elementwise_affine:
638
+ nn.init.ones_(module.weight)
639
+ nn.init.zeros_(module.bias)
640
+
641
+ def sample_time(self, batch_size: int, device, dtype) -> torch.Tensor:
642
+ beta_dist = Beta(self._beta_alpha, self._beta_beta)
643
+ sample = beta_dist.sample([batch_size]).to(device, dtype=dtype).clamp(max=self.noise_s)
644
+ return (self.noise_s - sample) / self.noise_s
645
+
646
+ def _encode_actions(
647
+ self,
648
+ noisy_actions: torch.Tensor,
649
+ t_discretized: torch.Tensor,
650
+ action_dof_mask: Optional[torch.Tensor],
651
+ device,
652
+ ) -> torch.Tensor:
653
+ """Encode noisy actions with DOF mask and timestep, add position embeddings."""
654
+ if action_dof_mask is not None:
655
+ encoder_input = torch.cat(
656
+ [noisy_actions, action_dof_mask.to(noisy_actions.dtype)], dim=-1
657
+ )
658
+ else:
659
+ encoder_input = torch.cat(
660
+ [noisy_actions, torch.ones_like(noisy_actions)], dim=-1
661
+ )
662
+
663
+ action_features = self.action_encoder(encoder_input, t_discretized)
664
+
665
+ if self.add_pos_embed:
666
+ pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device)
667
+ pos_embs = self.position_embedding(pos_ids).unsqueeze(0)
668
+ action_features = action_features + pos_embs
669
+
670
+ return action_features
671
+
672
+ def _dit_forward(
673
+ self,
674
+ sa_embs: torch.Tensor,
675
+ vl_embs: torch.Tensor,
676
+ t_discretized: torch.LongTensor,
677
+ encoder_attention_mask: Optional[torch.Tensor],
678
+ image_mask: Optional[torch.Tensor],
679
+ ) -> torch.Tensor:
680
+ if self.use_alternate_vl_dit:
681
+ return self.dit(
682
+ hidden_states=sa_embs,
683
+ encoder_hidden_states=vl_embs,
684
+ timestep=t_discretized,
685
+ encoder_attention_mask=encoder_attention_mask,
686
+ image_mask=image_mask,
687
+ )
688
+ return self.dit(
689
+ hidden_states=sa_embs,
690
+ encoder_hidden_states=vl_embs,
691
+ timestep=t_discretized,
692
+ encoder_attention_mask=encoder_attention_mask,
693
+ )
694
+
695
+ def forward(
696
+ self,
697
+ vl_embs: torch.Tensor,
698
+ actions: torch.Tensor,
699
+ action_dof_mask: Optional[torch.Tensor] = None,
700
+ encoder_attention_mask: Optional[torch.Tensor] = None,
701
+ image_mask: Optional[torch.Tensor] = None,
702
+ ) -> tuple:
703
+ """Training forward pass.
704
+
705
+ Args:
706
+ vl_embs: (B, S, D) VLM hidden states for cross-attention
707
+ actions: (B, T, action_dim) ground truth action trajectories
708
+ action_dof_mask: (B, T, action_dim) DOF validity mask
709
+ encoder_attention_mask: (B, S) bool – True = valid VLM token
710
+ image_mask: (B, S) bool – True = visual token (used by AlternateVLDiT)
711
+
712
+ Returns:
713
+ (pred_v, velocity): predicted velocity and target velocity, both (B, T, action_dim)
714
+ """
715
+ device = vl_embs.device
716
+ B = actions.shape[0]
717
+
718
+ noise = torch.randn(actions.shape, device=device, dtype=actions.dtype)
719
+ t = self.sample_time(B, device=device, dtype=actions.dtype)
720
+ t_expanded = t[:, None, None]
721
+
722
+ noisy_trajectory = (1 - t_expanded) * noise + t_expanded * actions
723
+ velocity = actions - noise
724
+
725
+ t_discretized = (t * self.num_timestep_buckets).long()
726
+
727
+ action_features = self._encode_actions(noisy_trajectory, t_discretized, action_dof_mask, device)
728
+
729
+ model_output = self._dit_forward(
730
+ action_features, vl_embs, t_discretized, encoder_attention_mask, image_mask
731
+ )
732
+
733
+ pred = self.action_decoder(model_output)
734
+ pred_v = pred[:, :actions.shape[1]]
735
+
736
+ return pred_v, velocity
737
+
738
+ @torch.no_grad()
739
+ def predict_action(
740
+ self,
741
+ vl_embs: torch.Tensor,
742
+ action_dof_mask: Optional[torch.Tensor] = None,
743
+ encoder_attention_mask: Optional[torch.Tensor] = None,
744
+ image_mask: Optional[torch.Tensor] = None,
745
+ ) -> torch.Tensor:
746
+ """Inference: denoise actions from noise using Euler integration.
747
+
748
+ Args:
749
+ vl_embs: (B, S, D) VLM hidden states
750
+ action_dof_mask: optional (B, T, action_dim) or (1, T, action_dim) DOF mask
751
+ encoder_attention_mask: (B, S) bool – True = valid VLM token
752
+ image_mask: (B, S) bool – True = visual token (used by AlternateVLDiT)
753
+
754
+ Returns:
755
+ (B, T, action_dim) denoised action trajectories
756
+ """
757
+ B = vl_embs.shape[0]
758
+ device = vl_embs.device
759
+ dtype = vl_embs.dtype
760
+
761
+ actions = torch.randn(
762
+ (B, self.action_chunk_size, self.action_dim),
763
+ device=device, dtype=dtype,
764
+ )
765
+
766
+ dt = 1.0 / self.num_inference_timesteps
767
+
768
+ for step in range(self.num_inference_timesteps):
769
+ t_cont = step / float(self.num_inference_timesteps)
770
+ t_discretized_val = int(t_cont * self.num_timestep_buckets)
771
+ timesteps_tensor = torch.full((B,), t_discretized_val, device=device, dtype=torch.long)
772
+
773
+ action_features = self._encode_actions(actions, timesteps_tensor, action_dof_mask, device)
774
+
775
+ model_output = self._dit_forward(
776
+ action_features, vl_embs, timesteps_tensor, encoder_attention_mask, image_mask
777
+ )
778
+
779
+ pred = self.action_decoder(model_output)
780
+ pred_velocity = pred[:, :self.action_chunk_size]
781
+
782
+ actions = actions + dt * pred_velocity
783
+
784
+ return actions
785
+
786
+
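A hedged usage sketch for this head follows; the surrounding trainer and the VLM that produces `vl_embs` live elsewhere, so the configuration values and the plain MSE loss below are assumptions for illustration only.

```python
import torch
import torch.nn.functional as F

# Hedged sketch: one training step and one inference call for FlowMatchingDiTHead.
# The config values and the MSE loss are assumptions; the real trainer lives outside this file.
head = FlowMatchingDiTHead(
    action_dim=32,
    action_chunk_size=16,
    cross_attention_dim=256,   # must match the width of the VLM hidden states below
    config={"num_layers": 4, "num_attention_heads": 4, "attention_head_dim": 32,
            "output_dim": 64, "dropout": 0.0},
)

B, T, S = 2, 16, 40
vl_embs  = torch.randn(B, S, 256)              # VLM hidden states (frozen context)
actions  = torch.randn(B, T, 32)               # ground-truth action chunk
dof_mask = torch.ones(B, T, 32)                # all DOFs valid
enc_mask = torch.ones(B, S, dtype=torch.bool)  # all VLM tokens valid

pred_v, target_v = head(vl_embs, actions, dof_mask, enc_mask)
loss = F.mse_loss(pred_v, target_v)            # velocity regression (assumed loss)
loss.backward()

sampled = head.predict_action(vl_embs, dof_mask, enc_mask)
assert sampled.shape == (B, 16, 32)
```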
787
+ # ============================================================================
788
+ # Pi0.5-style KV-cache action expert (VLM K/V concat + GQA + SwiGLU FFN)
789
+ # ============================================================================
790
+ class AdaRMSNorm(nn.Module):
791
+ """Adaptive RMS normalization: (scale, shift, gate) from cond; zero-init."""
792
+
793
+ def __init__(self, dim: int, eps: float = 1e-6):
794
+ super().__init__()
795
+ self.eps = eps
796
+ self.modulation = nn.Linear(dim, dim * 3)
797
+ nn.init.zeros_(self.modulation.weight)
798
+ nn.init.zeros_(self.modulation.bias)
799
+
800
+ def forward(self, x: torch.Tensor, cond: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
801
+ var = x.float().pow(2).mean(-1, keepdim=True)
802
+ normed = (x * torch.rsqrt(var + self.eps)).to(x.dtype)
803
+ scale, shift, gate = self.modulation(cond).chunk(3, dim=-1)
804
+ normed = normed * (1 + scale[:, None]) + shift[:, None]
805
+ return normed, gate[:, None]
806
+
807
+
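Because the modulation projection is zero-initialized, every AdaRMSNorm starts out as a plain RMSNorm with a zero gate, so each gated residual branch initially contributes nothing. The small check below illustrates that property:

```python
import torch

# At init the modulation projection is zero, so scale = shift = gate = 0:
# the normalized output is plain RMSNorm and the gated residual adds exactly 0.
norm = AdaRMSNorm(dim=16)
x    = torch.randn(2, 5, 16)
cond = torch.randn(2, 16)

normed, gate = norm(x, cond)
assert torch.all(gate == 0)          # residual: x + f(normed) * gate == x at initialization
rms = x * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(normed, rms.to(x.dtype), atol=1e-6)
```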
808
+ class SwiGLUFeedForward(nn.Module):
809
+ """SiLU(gate_proj(x)) * up_proj(x) → down_proj."""
810
+
811
+ def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0, bias: bool = True):
812
+ super().__init__()
813
+ self.gate_proj = nn.Linear(dim, hidden_dim, bias=bias)
814
+ self.up_proj = nn.Linear(dim, hidden_dim, bias=bias)
815
+ self.down_proj = nn.Linear(hidden_dim, dim, bias=bias)
816
+ self.dropout = nn.Dropout(dropout)
817
+
818
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
819
+ return self.down_proj(self.dropout(F.silu(self.gate_proj(x)) * self.up_proj(x)))
820
+
821
+
822
+ class MoTAttention(nn.Module):
823
+ """Action Q attends to concatenated [VLM KV cache ; action KV]; GQA expand for SDPA."""
824
+
825
+ def __init__(
826
+ self,
827
+ hidden_size: int,
828
+ num_attention_heads: int,
829
+ num_kv_heads: int,
830
+ head_dim: int,
831
+ dropout: float = 0.0,
832
+ bias: bool = True,
833
+ ):
834
+ super().__init__()
835
+ if num_attention_heads % num_kv_heads != 0:
836
+ raise ValueError(
837
+ f"num_attention_heads ({num_attention_heads}) must be divisible by "
838
+ f"num_kv_heads ({num_kv_heads})"
839
+ )
840
+ self.num_attention_heads = num_attention_heads
841
+ self.num_kv_heads = num_kv_heads
842
+ self.head_dim = head_dim
843
+ q_dim = num_attention_heads * head_dim
844
+ kv_dim = num_kv_heads * head_dim
845
+ self.q_proj = nn.Linear(hidden_size, q_dim, bias=bias)
846
+ self.k_proj = nn.Linear(hidden_size, kv_dim, bias=bias)
847
+ self.v_proj = nn.Linear(hidden_size, kv_dim, bias=bias)
848
+ self.o_proj = nn.Linear(q_dim, hidden_size, bias=bias)
849
+ self.dropout = nn.Dropout(dropout)
850
+
851
+ def forward(
852
+ self,
853
+ action_hidden: torch.Tensor,
854
+ vlm_cached_k: torch.Tensor,
855
+ vlm_cached_v: torch.Tensor,
856
+ vlm_attention_mask: Optional[torch.Tensor] = None,
857
+ ) -> torch.Tensor:
858
+ B, T_a, _ = action_hidden.shape
859
+
860
+ q = self.q_proj(action_hidden)
861
+ act_k = self.k_proj(action_hidden)
862
+ act_v = self.v_proj(action_hidden)
863
+
864
+ q = q.view(B, T_a, self.num_attention_heads, self.head_dim).transpose(1, 2)
865
+ act_k = act_k.view(B, T_a, self.num_kv_heads, self.head_dim).transpose(1, 2)
866
+ act_v = act_v.view(B, T_a, self.num_kv_heads, self.head_dim).transpose(1, 2)
867
+
868
+ k = torch.cat([vlm_cached_k, act_k], dim=2)
869
+ v = torch.cat([vlm_cached_v, act_v], dim=2)
870
+
871
+ repeat_factor = self.num_attention_heads // self.num_kv_heads
872
+ k = k.repeat_interleave(repeat_factor, dim=1)
873
+ v = v.repeat_interleave(repeat_factor, dim=1)
874
+
875
+ sdpa_mask = None
876
+ if vlm_attention_mask is not None:
877
+ action_mask = vlm_attention_mask.new_ones(B, T_a)
878
+ combined_mask = torch.cat([vlm_attention_mask, action_mask], dim=1)
879
+ sdpa_mask = combined_mask[:, None, None, :]
880
+
881
+ attn_out = F.scaled_dot_product_attention(
882
+ q, k, v, attn_mask=sdpa_mask, dropout_p=0.0,
883
+ )
884
+ attn_out = attn_out.transpose(1, 2).contiguous().view(B, T_a, -1)
885
+ return self.dropout(self.o_proj(attn_out))
886
+
887
+
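The grouped-query bookkeeping is: cached VLM K/V and the action K/V are concatenated along the sequence axis, then repeated ``num_attention_heads // num_kv_heads`` times along the head axis so SDPA sees matching head counts. A shape-only sketch with hypothetical sizes:

```python
import torch

# Illustrative GQA bookkeeping matching MoTAttention.forward (hypothetical sizes).
B, T_a, S_vlm = 2, 16, 40
num_q_heads, num_kv_heads, head_dim = 8, 2, 32

act_k = torch.randn(B, num_kv_heads, T_a,   head_dim)   # action keys
vlm_k = torch.randn(B, num_kv_heads, S_vlm, head_dim)   # cached VLM keys (one layer)

k = torch.cat([vlm_k, act_k], dim=2)                    # (B, kv_heads, S_vlm + T_a, D)
k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1)
assert k.shape == (B, num_q_heads, S_vlm + T_a, head_dim)
```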
888
+ class MoTBlock(nn.Module):
889
+ """AdaRMSNorm → attention → gated residual → AdaRMSNorm → SwiGLU FFN → gated residual."""
890
+
891
+ def __init__(
892
+ self,
893
+ hidden_size: int,
894
+ num_attention_heads: int,
895
+ num_kv_heads: int,
896
+ head_dim: int,
897
+ intermediate_size: int,
898
+ dropout: float = 0.0,
899
+ ):
900
+ super().__init__()
901
+ self.pre_attn_norm = AdaRMSNorm(hidden_size)
902
+ self.attn = MoTAttention(
903
+ hidden_size=hidden_size,
904
+ num_attention_heads=num_attention_heads,
905
+ num_kv_heads=num_kv_heads,
906
+ head_dim=head_dim,
907
+ dropout=dropout,
908
+ )
909
+ self.pre_ffn_norm = AdaRMSNorm(hidden_size)
910
+ self.ffn = SwiGLUFeedForward(hidden_size, intermediate_size, dropout=dropout)
911
+
912
+ def forward(
913
+ self,
914
+ action_hidden: torch.Tensor,
915
+ vlm_cached_k: torch.Tensor,
916
+ vlm_cached_v: torch.Tensor,
917
+ adarms_cond: torch.Tensor,
918
+ vlm_attention_mask: Optional[torch.Tensor] = None,
919
+ ) -> torch.Tensor:
920
+ normed, gate1 = self.pre_attn_norm(action_hidden, adarms_cond)
921
+ attn_out = self.attn(normed, vlm_cached_k, vlm_cached_v, vlm_attention_mask)
922
+ action_hidden = action_hidden + attn_out * gate1
923
+
924
+ normed2, gate2 = self.pre_ffn_norm(action_hidden, adarms_cond)
925
+ action_hidden = action_hidden + self.ffn(normed2) * gate2
926
+ return action_hidden
927
+
928
+
929
+ class MoTDiT(nn.Module):
930
+ """Stack of MoTBlocks; each block consumes one VLM layer's (K, V) pair."""
931
+
932
+ def __init__(
933
+ self,
934
+ hidden_size: int,
935
+ num_attention_heads: int,
936
+ num_kv_heads: int,
937
+ head_dim: int,
938
+ intermediate_size: int,
939
+ num_layers: int,
940
+ dropout: float = 0.2,
941
+ ):
942
+ super().__init__()
943
+ self.num_layers = num_layers
944
+ self.blocks = nn.ModuleList([
945
+ MoTBlock(
946
+ hidden_size=hidden_size,
947
+ num_attention_heads=num_attention_heads,
948
+ num_kv_heads=num_kv_heads,
949
+ head_dim=head_dim,
950
+ intermediate_size=intermediate_size,
951
+ dropout=dropout,
952
+ )
953
+ for _ in range(num_layers)
954
+ ])
955
+ self.final_norm = AdaRMSNorm(hidden_size)
956
+
957
+ def forward(
958
+ self,
959
+ action_hidden: torch.Tensor,
960
+ vlm_kv_cache: list,
961
+ adarms_cond: torch.Tensor,
962
+ vlm_attention_mask: Optional[torch.Tensor] = None,
963
+ ) -> torch.Tensor:
964
+ for idx, block in enumerate(self.blocks):
965
+ cached_k, cached_v = vlm_kv_cache[idx]
966
+ action_hidden = block(
967
+ action_hidden, cached_k, cached_v, adarms_cond, vlm_attention_mask,
968
+ )
969
+ action_hidden, _ = self.final_norm(action_hidden, adarms_cond)
970
+ return action_hidden
971
+
972
+
973
+ def _kv_pairs_from_past_key_values(past_key_values: Cache) -> list[tuple[torch.Tensor, torch.Tensor]]:
974
+ """Per-layer (K, V) from a HuggingFace decoder KV cache (order matches transformer layers)."""
975
+ return [
976
+ (past_key_values[i][0], past_key_values[i][1])
977
+ for i in range(len(past_key_values))
978
+ ]
979
+
980
+
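A hedged example of driving this helper with a synthetic ``DynamicCache`` (illustrative only; it assumes the installed transformers version supports tuple-style indexing on the cache, which the helper itself also relies on):

```python
import torch
from transformers.cache_utils import DynamicCache

# Illustrative only: fabricate a 2-layer KV cache and read it back as (K, V) pairs.
B, num_kv_heads, S, head_dim = 2, 4, 10, 32
cache = DynamicCache()
for layer_idx in range(2):
    k = torch.randn(B, num_kv_heads, S, head_dim)
    v = torch.randn(B, num_kv_heads, S, head_dim)
    cache.update(k, v, layer_idx)

pairs = _kv_pairs_from_past_key_values(cache)
assert len(pairs) == 2 and pairs[0][0].shape == (B, num_kv_heads, S, head_dim)
```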
981
+ class MoTFlowMatchingHead(nn.Module):
982
+ """Flow matching head: MoT-style action expert over VLM KV cache (concat + GQA)."""
983
+
984
+ def __init__(
985
+ self,
986
+ action_dim: int,
987
+ action_chunk_size: int,
988
+ vlm_config,
989
+ num_inference_timesteps: int = 10,
990
+ config: Optional[dict] = None,
991
+ ):
992
+ super().__init__()
993
+
994
+ _vlm_num_q_heads = 8 # vlm_config.num_attention_heads // 2 # optional: 8
995
+ _vlm_num_kv_heads = vlm_config.num_key_value_heads # 8
996
+ _vlm_head_dim = getattr(
997
+ vlm_config, "head_dim", vlm_config.hidden_size // vlm_config.num_attention_heads
998
+ ) # 128
999
+
1000
+ cfg = {
1001
+ "hidden_size": 1024, # vlm_config.hidden_size // 2,
1002
+ # "hidden_size": vlm_config.hidden_size // 2,
1003
+ "intermediate_size": vlm_config.intermediate_size // 4,
1004
+ "expert_num_layers": vlm_config.num_hidden_layers,
1005
+ # Attention dims default to VLM values (required for KV cache compat)
1006
+ "num_attention_heads": _vlm_num_q_heads,
1007
+ "num_kv_heads": _vlm_num_kv_heads,
1008
+ "head_dim": _vlm_head_dim,
1009
+ # Noise schedule
1010
+ "dropout": 0.2,
1011
+ "add_pos_embed": True,
1012
+ "noise_beta_alpha": 1.5,
1013
+ "noise_beta_beta": 1.0,
1014
+ "noise_s": 0.999,
1015
+ "num_timestep_buckets": 1000,
1016
+ }
1017
+ if config is not None:
1018
+ cfg.update(config)
1019
+
1020
+ num_attention_heads = cfg["num_attention_heads"]
1021
+ num_kv_heads = cfg["num_kv_heads"]
1022
+ head_dim = cfg["head_dim"]
1023
+ hidden_size = cfg["hidden_size"]
1024
+ intermediate_size = cfg["intermediate_size"]
1025
+ num_layers = cfg["expert_num_layers"]
1026
+
1027
+ self.action_dim = action_dim
1028
+ self.action_chunk_size = action_chunk_size
1029
+ self.num_inference_timesteps = num_inference_timesteps
1030
+ self.num_timestep_buckets = cfg["num_timestep_buckets"]
1031
+ self.noise_s = cfg["noise_s"]
1032
+ self.add_pos_embed = cfg["add_pos_embed"]
1033
+
1034
+ self.action_in_proj = nn.Linear(action_dim * 2, hidden_size)
1035
+ self.action_out_proj = nn.Linear(hidden_size, action_dim)
1036
+
1037
+ self.time_sinusoidal = SinusoidalPositionalEncoding(hidden_size)
1038
+ self.time_mlp_1 = nn.Linear(hidden_size, hidden_size)
1039
+ self.time_mlp_2 = nn.Linear(hidden_size, hidden_size)
1040
+
1041
+ if self.add_pos_embed:
1042
+ max_seq = max(action_chunk_size, 256)
1043
+ self.position_embedding = nn.Embedding(max_seq, hidden_size)
1044
+ nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
1045
+
1046
+ self.dit = MoTDiT(
1047
+ hidden_size=hidden_size,
1048
+ num_attention_heads=num_attention_heads,
1049
+ num_kv_heads=num_kv_heads,
1050
+ head_dim=head_dim,
1051
+ intermediate_size=intermediate_size,
1052
+ num_layers=num_layers,
1053
+ dropout=cfg["dropout"],
1054
+ )
1055
+
1056
+ self._beta_alpha = cfg["noise_beta_alpha"]
1057
+ self._beta_beta = cfg["noise_beta_beta"]
1058
+
1059
+ @property
1060
+ def num_dit_layers(self) -> int:
1061
+ """Number of expert blocks; must match ``len(past_key_values.key_cache)``."""
1062
+ return self.dit.num_layers
1063
+
1064
+ def _vlm_kv_list_from_past(self, past_key_values: Cache) -> list[tuple[torch.Tensor, torch.Tensor]]:
1065
+ n = len(past_key_values)
1066
+ if n != self.num_dit_layers:
1067
+ raise ValueError(
1068
+ f"MoT expert has {self.num_dit_layers} blocks but `past_key_values` has {n} "
1069
+ "layers. Set `dit_action_head_config['expert_num_layers']` to match "
1070
+ "`text_config.num_hidden_layers`."
1071
+ )
1072
+ return _kv_pairs_from_past_key_values(past_key_values)
1073
+
1074
+ def reset_parameters(self):
1075
+ """Re-apply proper initialization after from_pretrained."""
1076
+ if self.add_pos_embed:
1077
+ nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
1078
+ for module in self.modules():
1079
+ if isinstance(module, AdaRMSNorm):
1080
+ nn.init.zeros_(module.modulation.weight)
1081
+ nn.init.zeros_(module.modulation.bias)
1082
+ elif isinstance(module, nn.Linear):
1083
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
1084
+ if module.bias is not None:
1085
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
1086
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
1087
+ nn.init.uniform_(module.bias, -bound, bound)
1088
+
1089
+ def _compute_adarms_cond(self, t_discretized: torch.Tensor) -> torch.Tensor:
1090
+ t_emb = self.time_sinusoidal(t_discretized.float())
1091
+ t_emb = t_emb.to(dtype=self.time_mlp_1.weight.dtype)
1092
+ t_emb = F.silu(self.time_mlp_1(t_emb))
1093
+ t_emb = F.silu(self.time_mlp_2(t_emb))
1094
+ return t_emb
1095
+
1096
+ def sample_time(self, batch_size: int, device, dtype) -> torch.Tensor:
1097
+ beta_dist = Beta(self._beta_alpha, self._beta_beta)
1098
+ sample = beta_dist.sample([batch_size]).to(device, dtype=dtype).clamp(max=self.noise_s)
1099
+ return (self.noise_s - sample) / self.noise_s
1100
+
1101
+ def _prepare_action_embeds(
1102
+ self,
1103
+ noisy_actions: torch.Tensor,
1104
+ action_dof_mask: Optional[torch.Tensor],
1105
+ ) -> torch.Tensor:
1106
+ if action_dof_mask is not None:
1107
+ x = torch.cat(
1108
+ [noisy_actions, action_dof_mask.to(noisy_actions.dtype)], dim=-1,
1109
+ )
1110
+ else:
1111
+ x = torch.cat([noisy_actions, torch.ones_like(noisy_actions)], dim=-1)
1112
+
1113
+ tokens = self.action_in_proj(x)
1114
+
1115
+ if self.add_pos_embed:
1116
+ pos_ids = torch.arange(tokens.shape[1], dtype=torch.long, device=noisy_actions.device)
1117
+ tokens = tokens + self.position_embedding(pos_ids).unsqueeze(0)
1118
+
1119
+ return tokens
1120
+
1121
+ def forward(
1122
+ self,
1123
+ past_key_values: Cache,
1124
+ actions: torch.Tensor,
1125
+ action_dof_mask: Optional[torch.Tensor] = None,
1126
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1127
+ ) -> tuple:
1128
+ """Training: returns (pred_velocity, target_velocity).
1129
+
1130
+ Args:
1131
+ past_key_values: VLM decoder KV cache; layer count must equal ``num_dit_layers``.
1132
+ """
1133
+ vlm_kv_cache = self._vlm_kv_list_from_past(past_key_values)
1134
+ device = actions.device
1135
+ B = actions.shape[0]
1136
+
1137
+ noise = torch.randn(actions.shape, device=device, dtype=actions.dtype)
1138
+ t = self.sample_time(B, device=device, dtype=actions.dtype)
1139
+ t_expanded = t[:, None, None]
1140
+
1141
+ noisy_trajectory = (1 - t_expanded) * noise + t_expanded * actions
1142
+ velocity = actions - noise
1143
+
1144
+ t_discretized = (t * self.num_timestep_buckets).long()
1145
+ adarms_cond = self._compute_adarms_cond(t_discretized)
1146
+
1147
+ action_tokens = self._prepare_action_embeds(noisy_trajectory, action_dof_mask)
1148
+
1149
+ output = self.dit(
1150
+ action_tokens, vlm_kv_cache, adarms_cond, encoder_attention_mask,
1151
+ )
1152
+
1153
+ pred = self.action_out_proj(output)
1154
+ pred_v = pred[:, :actions.shape[1]]
1155
+ return pred_v, velocity
1156
+
1157
+ def compute_velocity(
1158
+ self,
1159
+ past_key_values: Cache,
1160
+ actions: torch.Tensor,
1161
+ noise: torch.Tensor,
1162
+ t: torch.Tensor,
1163
+ action_dof_mask: Optional[torch.Tensor] = None,
1164
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1165
+ ) -> torch.Tensor:
1166
+ """Compute velocity prediction for pre-sampled noise and timestep.
1167
+
1168
+ Used by DiffusionNFT where noise and timestep must be shared between
1169
+ the current policy (v_θ) and the reference policy (v_old).
1170
+
1171
+ Args:
1172
+ past_key_values: VLM decoder KV cache
1173
+ actions: (B, T, action_dim) ground truth actions (x_0)
1174
+ noise: (B, T, action_dim) pre-sampled noise (ε)
1175
+ t: (B,) continuous timesteps in [0, 1)
1176
+ action_dof_mask, encoder_attention_mask: same as in ``forward``.
1177
+
1178
+ Returns:
1179
+ pred_v: (B, T, action_dim) predicted velocity
1180
+ """
1181
+ vlm_kv_cache = self._vlm_kv_list_from_past(past_key_values)
1182
+ device = actions.device
1183
+ t_expanded = t[:, None, None]
1184
+
1185
+ noisy_trajectory = (1 - t_expanded) * noise + t_expanded * actions
1186
+ t_discretized = (t * self.num_timestep_buckets).long()
1187
+ adarms_cond = self._compute_adarms_cond(t_discretized)
1188
+ action_tokens = self._prepare_action_embeds(noisy_trajectory, action_dof_mask)
1189
+ output = self.dit(
1190
+ action_tokens, vlm_kv_cache, adarms_cond, encoder_attention_mask,
1191
+ )
1192
+ pred = self.action_out_proj(output)
1193
+ return pred[:, :actions.shape[1]]
1194
+
1195
+
1196
+ @torch.no_grad()
1197
+ def predict_action(
1198
+ self,
1199
+ past_key_values: Cache,
1200
+ action_dof_mask: Optional[torch.Tensor] = None,
1201
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1202
+ ) -> torch.Tensor:
1203
+ """Inference: Euler integration, returns (B, chunk_size, action_dim)."""
1204
+ k0 = past_key_values[0][0]
1205
+ B = k0.shape[0]
1206
+ device = k0.device
1207
+ dtype = k0.dtype
1208
+ vlm_kv_cache = self._vlm_kv_list_from_past(past_key_values)
1209
+
1210
+ actions = torch.randn(
1211
+ (B, self.action_chunk_size, self.action_dim),
1212
+ device=device, dtype=dtype,
1213
+ )
1214
+ dt = 1.0 / self.num_inference_timesteps
1215
+
1216
+ for step in range(self.num_inference_timesteps):
1217
+ t_cont = step / float(self.num_inference_timesteps)
1218
+ t_disc_val = int(t_cont * self.num_timestep_buckets)
1219
+ t_tensor = torch.full((B,), t_disc_val, device=device, dtype=torch.long)
1220
+
1221
+ adarms_cond = self._compute_adarms_cond(t_tensor)
1222
+ action_tokens = self._prepare_action_embeds(actions, action_dof_mask)
1223
+
1224
+ output = self.dit(
1225
+ action_tokens, vlm_kv_cache, adarms_cond, encoder_attention_mask,
1226
+ )
1227
+ pred_velocity = self.action_out_proj(output)[:, :self.action_chunk_size]
1228
+ actions = actions + dt * pred_velocity
1229
+
1230
+ return actions
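Putting the pieces together, a caller is expected to run the VLM prefix once with caching enabled and hand the resulting KV cache to this head. The sketch below is an assumption about that wiring (the VLM call and its argument names are not defined in this file); only the head's own `forward` / `predict_action` signatures are taken from the code above.

```python
import torch
import torch.nn.functional as F

# Assumed wiring (not in this file): run the VLM prefix once with use_cache=True, e.g.
#   past_key_values = vlm(input_ids=..., pixel_values=..., use_cache=True).past_key_values
# and keep the (B, S) bool attention mask from that same forward pass.

def run_head(head, past_key_values, attention_mask, actions, dof_mask):
    # Training step: velocity regression against the flow-matching target.
    pred_v, target_v = head(past_key_values, actions, dof_mask, attention_mask)
    loss = F.mse_loss(pred_v, target_v)          # plain MSE (assumption)

    # Inference: Euler integration over the same cached VLM context.
    action_chunk = head.predict_action(past_key_values, dof_mask, attention_mask)
    return loss, action_chunk
```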
generation_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "temperature": 0.7,
9
+ "top_k": 20,
10
+ "top_p": 0.8,
11
+ "transformers_version": "4.57.3"
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b8edeb6b51406a1cbca6ef289fc1ee9fb848ffcbb0eaf100916cf3f5580b263
3
+ size 4999639274
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1e8661f341358939418a297d23587849fb79a96ac018a7cd88c1109f39be5c8
3
+ size 4708533880
model.safetensors.index.json ADDED
@@ -0,0 +1,777 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4460513537,
4
+ "total_size": 9708083714
5
+ },
6
+ "weight_map": {
7
+ "crl_action_encoder.hidden_layers.0.bias": "model-00002-of-00002.safetensors",
8
+ "crl_action_encoder.hidden_layers.0.weight": "model-00002-of-00002.safetensors",
9
+ "crl_action_encoder.hidden_layers.1.bias": "model-00002-of-00002.safetensors",
10
+ "crl_action_encoder.hidden_layers.1.weight": "model-00002-of-00002.safetensors",
11
+ "crl_action_encoder.hidden_layers.2.bias": "model-00002-of-00002.safetensors",
12
+ "crl_action_encoder.hidden_layers.2.weight": "model-00002-of-00002.safetensors",
13
+ "crl_action_encoder.hidden_layers.3.bias": "model-00002-of-00002.safetensors",
14
+ "crl_action_encoder.hidden_layers.3.weight": "model-00002-of-00002.safetensors",
15
+ "crl_action_encoder.layer_norms.0.bias": "model-00002-of-00002.safetensors",
16
+ "crl_action_encoder.layer_norms.0.weight": "model-00002-of-00002.safetensors",
17
+ "crl_action_encoder.layer_norms.1.bias": "model-00002-of-00002.safetensors",
18
+ "crl_action_encoder.layer_norms.1.weight": "model-00002-of-00002.safetensors",
19
+ "crl_action_encoder.layer_norms.2.bias": "model-00002-of-00002.safetensors",
20
+ "crl_action_encoder.layer_norms.2.weight": "model-00002-of-00002.safetensors",
21
+ "crl_action_encoder.layer_norms.3.bias": "model-00002-of-00002.safetensors",
22
+ "crl_action_encoder.layer_norms.3.weight": "model-00002-of-00002.safetensors",
23
+ "crl_action_encoder.output_proj.bias": "model-00002-of-00002.safetensors",
24
+ "crl_action_encoder.output_proj.weight": "model-00002-of-00002.safetensors",
25
+ "crl_goal_encoder.hidden_layers.0.bias": "model-00002-of-00002.safetensors",
26
+ "crl_goal_encoder.hidden_layers.0.weight": "model-00002-of-00002.safetensors",
27
+ "crl_goal_encoder.hidden_layers.1.bias": "model-00002-of-00002.safetensors",
28
+ "crl_goal_encoder.hidden_layers.1.weight": "model-00002-of-00002.safetensors",
29
+ "crl_goal_encoder.hidden_layers.2.bias": "model-00002-of-00002.safetensors",
30
+ "crl_goal_encoder.hidden_layers.2.weight": "model-00002-of-00002.safetensors",
31
+ "crl_goal_encoder.hidden_layers.3.bias": "model-00002-of-00002.safetensors",
32
+ "crl_goal_encoder.hidden_layers.3.weight": "model-00002-of-00002.safetensors",
33
+ "crl_goal_encoder.layer_norms.0.bias": "model-00002-of-00002.safetensors",
34
+ "crl_goal_encoder.layer_norms.0.weight": "model-00002-of-00002.safetensors",
35
+ "crl_goal_encoder.layer_norms.1.bias": "model-00002-of-00002.safetensors",
36
+ "crl_goal_encoder.layer_norms.1.weight": "model-00002-of-00002.safetensors",
37
+ "crl_goal_encoder.layer_norms.2.bias": "model-00002-of-00002.safetensors",
38
+ "crl_goal_encoder.layer_norms.2.weight": "model-00002-of-00002.safetensors",
39
+ "crl_goal_encoder.layer_norms.3.bias": "model-00002-of-00002.safetensors",
40
+ "crl_goal_encoder.layer_norms.3.weight": "model-00002-of-00002.safetensors",
41
+ "crl_goal_encoder.output_proj.bias": "model-00002-of-00002.safetensors",
42
+ "crl_goal_encoder.output_proj.weight": "model-00002-of-00002.safetensors",
43
+ "crl_logit_scale": "model-00001-of-00002.safetensors",
44
+ "crl_obs_encoder.hidden_layers.0.bias": "model-00002-of-00002.safetensors",
45
+ "crl_obs_encoder.hidden_layers.0.weight": "model-00002-of-00002.safetensors",
46
+ "crl_obs_encoder.hidden_layers.1.bias": "model-00002-of-00002.safetensors",
47
+ "crl_obs_encoder.hidden_layers.1.weight": "model-00002-of-00002.safetensors",
48
+ "crl_obs_encoder.hidden_layers.2.bias": "model-00002-of-00002.safetensors",
49
+ "crl_obs_encoder.hidden_layers.2.weight": "model-00002-of-00002.safetensors",
50
+ "crl_obs_encoder.hidden_layers.3.bias": "model-00002-of-00002.safetensors",
51
+ "crl_obs_encoder.hidden_layers.3.weight": "model-00002-of-00002.safetensors",
52
+ "crl_obs_encoder.layer_norms.0.bias": "model-00002-of-00002.safetensors",
53
+ "crl_obs_encoder.layer_norms.0.weight": "model-00002-of-00002.safetensors",
54
+ "crl_obs_encoder.layer_norms.1.bias": "model-00002-of-00002.safetensors",
55
+ "crl_obs_encoder.layer_norms.1.weight": "model-00002-of-00002.safetensors",
56
+ "crl_obs_encoder.layer_norms.2.bias": "model-00002-of-00002.safetensors",
57
+ "crl_obs_encoder.layer_norms.2.weight": "model-00002-of-00002.safetensors",
58
+ "crl_obs_encoder.layer_norms.3.bias": "model-00002-of-00002.safetensors",
59
+ "crl_obs_encoder.layer_norms.3.weight": "model-00002-of-00002.safetensors",
60
+ "crl_obs_encoder.output_proj.bias": "model-00002-of-00002.safetensors",
61
+ "crl_obs_encoder.output_proj.weight": "model-00002-of-00002.safetensors",
62
+ "language_model.embed_tokens.weight": "model-00001-of-00002.safetensors",
63
+ "language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
+ "language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
+ "language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
+ "language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
+ "language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
+ "language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
+ "language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
+ "language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
+ "language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
+ "language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
+ "language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
+ "language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
+ "language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
+ "language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
+ "language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
+ "language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
+ "language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
+ "language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
+ "language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
+ "language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
+ "language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
+ "language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
+ "language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
+ "language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
+ "language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
+ "language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
+ "language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
+ "language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
+ "language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
+ "language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
+ "language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
+ "language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
+ "language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
+ "language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
+ "language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
+ "language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
+ "language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
+ "language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
+ "language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
+ "language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
+ "language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
+ "language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
+ "language_model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
+ "language_model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
+ "language_model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
+ "language_model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
+ "language_model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
+ "language_model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
+ "language_model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
+ "language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
+ "language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
+ "language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "language_model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
+ "language_model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
+ "language_model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
+ "language_model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
+ "language_model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "language_model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "language_model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
+ "language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
+ "language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
+ "language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
+ "language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
+ "language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
+ "language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
+ "language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
+ "language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "language_model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
+ "language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
+ "language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "language_model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
+ "language_model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "language_model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "language_model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
+ "language_model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "language_model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "language_model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
+ "language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
+ "language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
+ "language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
+ "language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
+ "language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
+ "language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
+ "language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
+ "language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
+ "language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
+ "language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
+ "language_model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
+ "language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
+ "language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
+ "language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
+ "language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
+ "language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
+ "language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
+ "language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
+ "language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
+ "language_model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
+ "language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
+ "language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
+ "language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
+ "language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
+ "language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
+ "language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
+ "language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
+ "language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
+ "language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
+ "language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
196
+ "language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
197
+ "language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
198
+ "language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
199
+ "language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
200
+ "language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
201
+ "language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
202
+ "language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
203
+ "language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
204
+ "language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
205
+ "language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
206
+ "language_model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
+ "language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
+ "language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
+ "language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
+ "language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
+ "language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
+ "language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
+ "language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
+ "language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
+ "language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
+ "language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
+ "language_model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
+ "language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
+ "language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
+ "language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
+ "language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
+ "language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
+ "language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
+ "language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
+ "language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
+ "language_model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "language_model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
+ "language_model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
+ "language_model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
+ "language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
+ "language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
+ "language_model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "language_model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
+ "language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
+ "language_model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
+ "language_model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
+ "language_model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
+ "language_model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
+ "language_model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
+ "language_model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
+ "language_model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
+ "language_model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
+ "language_model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
+ "language_model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
+ "language_model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
+ "language_model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
+ "language_model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
+ "language_model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
+ "language_model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
+ "language_model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
+ "language_model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
+ "language_model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
+ "language_model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
+ "language_model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
+ "language_model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
+ "language_model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
+ "language_model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
+ "language_model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
+ "language_model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
262
+ "language_model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
263
+ "language_model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
264
+ "language_model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
265
+ "language_model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
266
+ "language_model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
267
+ "language_model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
268
+ "language_model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
269
+ "language_model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
270
+ "language_model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
271
+ "language_model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
272
+ "language_model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "language_model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "language_model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "language_model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "language_model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "language_model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
+ "language_model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
+ "language_model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
+ "language_model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
+ "language_model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
+ "language_model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
+ "language_model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "language_model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
+ "language_model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
+ "language_model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
+ "language_model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
+ "language_model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
+ "language_model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
+ "language_model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
+ "language_model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
+ "language_model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
+ "language_model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
+ "language_model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "language_model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
+ "language_model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
+ "language_model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
+ "language_model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "language_model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
+ "language_model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
+ "language_model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
+ "language_model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
+ "language_model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
+ "language_model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
+ "language_model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
+ "language_model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
+ "language_model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
+ "language_model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
+ "language_model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
+ "language_model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
+ "language_model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
+ "language_model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
+ "language_model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
+ "language_model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
+ "language_model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
+ "language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
317
+ "language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
318
+ "language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
319
+ "language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
320
+ "language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
321
+ "language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
322
+ "language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
323
+ "language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
324
+ "language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
325
+ "language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
326
+ "language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
327
+ "language_model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
+ "language_model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
+ "language_model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
+ "language_model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
+ "language_model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "language_model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
+ "language_model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
+ "language_model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
+ "language_model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
+ "language_model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
+ "language_model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "language_model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
339
+ "language_model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
340
+ "language_model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
341
+ "language_model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
342
+ "language_model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
343
+ "language_model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
344
+ "language_model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
345
+ "language_model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
346
+ "language_model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
347
+ "language_model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
348
+ "language_model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
349
+ "language_model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
350
+ "language_model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
351
+ "language_model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
352
+ "language_model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
353
+ "language_model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
354
+ "language_model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
355
+ "language_model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
356
+ "language_model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
357
+ "language_model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
358
+ "language_model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
359
+ "language_model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
360
+ "language_model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
361
+ "language_model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
362
+ "language_model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
363
+ "language_model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
364
+ "language_model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
365
+ "language_model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
366
+ "language_model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
367
+ "language_model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
368
+ "language_model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
369
+ "language_model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
370
+ "language_model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
371
+ "language_model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
372
+ "language_model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
373
+ "language_model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
374
+ "language_model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
375
+ "language_model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
376
+ "language_model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
377
+ "language_model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
378
+ "language_model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
379
+ "language_model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
380
+ "language_model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
381
+ "language_model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
382
+ "language_model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
383
+ "language_model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
384
+ "language_model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
385
+ "language_model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
386
+ "language_model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
387
+ "language_model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
388
+ "language_model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
389
+ "language_model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
390
+ "language_model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
391
+ "language_model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
392
+ "language_model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
393
+ "language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
+ "language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
+ "language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
+ "language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
+ "language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
+ "language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
+ "language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
+ "language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
+ "language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
+ "language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
405
+ "language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
406
+ "language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
407
+ "language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
408
+ "language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
409
+ "language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
410
+ "language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
411
+ "language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
412
+ "language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
413
+ "language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
414
+ "language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
415
+ "language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
416
+ "language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
417
+ "language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
418
+ "language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
419
+ "language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
420
+ "language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
421
+ "language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
422
+ "language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
423
+ "language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
424
+ "language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
425
+ "language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
426
+ "language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
427
+ "language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
428
+ "language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
429
+ "language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
430
+ "language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
431
+ "language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
432
+ "language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
433
+ "language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
434
+ "language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
435
+ "language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
436
+ "language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
437
+ "language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
438
+ "language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
439
+ "language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
440
+ "language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
441
+ "language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
442
+ "language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
443
+ "language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
444
+ "language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
445
+ "language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
446
+ "language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
447
+ "language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
448
+ "language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
449
+ "language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
450
+ "language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
451
+ "language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
452
+ "language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
453
+ "language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
454
+ "language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
455
+ "language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
456
+ "language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
457
+ "language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
458
+ "language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
459
+ "language_model.norm.weight": "model-00002-of-00002.safetensors",
460
+ "lm_head.weight": "model-00002-of-00002.safetensors",
461
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
462
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
463
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
464
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
465
+ "visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
466
+ "visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
467
+ "visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
468
+ "visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
469
+ "visual.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
470
+ "visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
471
+ "visual.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
472
+ "visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
473
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
474
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
475
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
476
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
477
+ "visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
478
+ "visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
479
+ "visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
480
+ "visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
481
+ "visual.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
482
+ "visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
483
+ "visual.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
484
+ "visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
485
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
486
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
487
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
488
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
489
+ "visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
490
+ "visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
491
+ "visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
492
+ "visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
493
+ "visual.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
494
+ "visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
495
+ "visual.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
496
+ "visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
497
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
498
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
499
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
500
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
501
+ "visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
502
+ "visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
503
+ "visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
504
+ "visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
505
+ "visual.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
506
+ "visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
507
+ "visual.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
508
+ "visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
509
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
510
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
511
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
512
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
513
+ "visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
514
+ "visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
515
+ "visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
516
+ "visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
517
+ "visual.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
518
+ "visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
519
+ "visual.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
520
+ "visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
521
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
522
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
523
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
524
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
525
+ "visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
526
+ "visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
527
+ "visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
528
+ "visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
529
+ "visual.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
530
+ "visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
531
+ "visual.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
532
+ "visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
533
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
534
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
535
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
536
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
537
+ "visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
538
+ "visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
539
+ "visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
540
+ "visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
541
+ "visual.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
542
+ "visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
543
+ "visual.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
544
+ "visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
545
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
546
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
547
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
548
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
549
+ "visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
550
+ "visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
551
+ "visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
552
+ "visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
553
+ "visual.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
554
+ "visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
555
+ "visual.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
556
+ "visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
557
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
558
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
559
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
560
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
561
+ "visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
562
+ "visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
563
+ "visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
564
+ "visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
565
+ "visual.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
566
+ "visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
567
+ "visual.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
568
+ "visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
569
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
570
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
571
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
572
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
573
+ "visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
574
+ "visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
575
+ "visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
576
+ "visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
577
+ "visual.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
578
+ "visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
579
+ "visual.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
580
+ "visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
581
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
582
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
583
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
584
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
585
+ "visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
586
+ "visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
587
+ "visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
588
+ "visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
589
+ "visual.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
590
+ "visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
591
+ "visual.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
592
+ "visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
593
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
594
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
595
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
596
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
597
+ "visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
598
+ "visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
599
+ "visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
600
+ "visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
601
+ "visual.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
602
+ "visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
603
+ "visual.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
604
+ "visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
605
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
606
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
607
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
608
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
609
+ "visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
610
+ "visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
611
+ "visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
612
+ "visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
613
+ "visual.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
614
+ "visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
615
+ "visual.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
616
+ "visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
617
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
618
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
619
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
620
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
621
+ "visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
622
+ "visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
623
+ "visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
624
+ "visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
625
+ "visual.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
626
+ "visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
627
+ "visual.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
628
+ "visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
629
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
630
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
631
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
632
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
633
+ "visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
634
+ "visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
635
+ "visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
636
+ "visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
637
+ "visual.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
638
+ "visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
639
+ "visual.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
640
+ "visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
641
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
642
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
643
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
644
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
645
+ "visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
646
+ "visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
647
+ "visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
648
+ "visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
649
+ "visual.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
650
+ "visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
651
+ "visual.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
652
+ "visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
653
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
654
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
655
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
656
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
657
+ "visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
658
+ "visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
659
+ "visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
660
+ "visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
661
+ "visual.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
662
+ "visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
663
+ "visual.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
664
+ "visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
665
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
666
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
667
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
668
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
669
+ "visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
670
+ "visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
671
+ "visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
672
+ "visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
673
+ "visual.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
674
+ "visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
675
+ "visual.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
676
+ "visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
677
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
678
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
679
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
680
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
681
+ "visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
682
+ "visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
683
+ "visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
684
+ "visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
685
+ "visual.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
686
+ "visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
687
+ "visual.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
688
+ "visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
689
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
690
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
691
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
692
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
693
+ "visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
694
+ "visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
695
+ "visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
696
+ "visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
697
+ "visual.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
698
+ "visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
699
+ "visual.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
700
+ "visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
701
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
702
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
703
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
704
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
705
+ "visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
706
+ "visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
707
+ "visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
708
+ "visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
709
+ "visual.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
710
+ "visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
711
+ "visual.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
712
+ "visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
713
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
714
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
715
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
716
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
717
+ "visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
718
+ "visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
719
+ "visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
720
+ "visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
721
+ "visual.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
722
+ "visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
723
+ "visual.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
724
+ "visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
725
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
726
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
727
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
728
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
729
+ "visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
730
+ "visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
731
+ "visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
732
+ "visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
733
+ "visual.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
734
+ "visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
735
+ "visual.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
736
+ "visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
737
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
738
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
739
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
740
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
741
+ "visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
742
+ "visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
743
+ "visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
744
+ "visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
745
+ "visual.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
746
+ "visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
747
+ "visual.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
748
+ "visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
749
+ "visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00002.safetensors",
750
+ "visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00002.safetensors",
751
+ "visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00002.safetensors",
752
+ "visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00002.safetensors",
753
+ "visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00002.safetensors",
754
+ "visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00002.safetensors",
755
+ "visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00002.safetensors",
756
+ "visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00002.safetensors",
757
+ "visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00002.safetensors",
758
+ "visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00002.safetensors",
759
+ "visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00002.safetensors",
760
+ "visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00002.safetensors",
761
+ "visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00002.safetensors",
762
+ "visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00002.safetensors",
763
+ "visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00002.safetensors",
764
+ "visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00002.safetensors",
765
+ "visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00002.safetensors",
766
+ "visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00002.safetensors",
767
+ "visual.merger.linear_fc1.bias": "model-00001-of-00002.safetensors",
768
+ "visual.merger.linear_fc1.weight": "model-00001-of-00002.safetensors",
769
+ "visual.merger.linear_fc2.bias": "model-00001-of-00002.safetensors",
770
+ "visual.merger.linear_fc2.weight": "model-00001-of-00002.safetensors",
771
+ "visual.merger.norm.bias": "model-00001-of-00002.safetensors",
772
+ "visual.merger.norm.weight": "model-00001-of-00002.safetensors",
773
+ "visual.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
774
+ "visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
775
+ "visual.pos_embed.weight": "model-00001-of-00002.safetensors"
776
+ }
777
+ }
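
The `weight_map` above ties every tensor name to one of the two safetensors shards. As a hedged illustration (not part of this commit), the sketch below shows how such an index is typically consumed: look up a tensor's shard in `model.safetensors.index.json`, then read only that tensor from the corresponding shard file. The file and tensor names are taken from the listing above; the loading code itself is an assumption based on the standard safetensors API, not code from this repository.

```python
import json
from safetensors import safe_open

# Standard sharded-checkpoint index layout:
# {"metadata": {...}, "weight_map": {tensor_name: shard_file, ...}}
with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "language_model.layers.16.mlp.gate_proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00001-of-00002.safetensors"

# Open only the shard that holds this tensor and read it lazily.
with safe_open(shard, framework="pt") as handle:
    tensor = handle.get_tensor(name)

print(name, tuple(tensor.shape), "from", shard)
```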
modeling_prts_qwen3_vl.py ADDED
@@ -0,0 +1,935 @@
1
+ # Copyright 2025 TeleAI Rhodes Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Main VLA model architecture based on Qwen3-VL."""
16
+
17
+ from dataclasses import dataclass
18
+
19
+ import math
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ from torch.nn import CrossEntropyLoss, MSELoss
25
+ from typing import Any, Dict, List, Optional, Tuple, Union
26
+
27
+ from transformers.modeling_outputs import ModelOutput
28
+ from transformers.cache_utils import Cache
29
+ from transformers.processing_utils import Unpack
30
+ from transformers.utils import TransformersKwargs, is_torchdynamo_compiling
31
+
32
+ from .modeling_qwen3_vl import (
33
+ Qwen3VLForConditionalGeneration,
34
+ Qwen3VLTextModel,
35
+ Qwen3VLVisionModel,
36
+ )
37
+ from .configuration_prts_qwen3_vl import PRTS_FlowMatchingConfig_Qwen3VL
38
+ from .dit_action_head import FlowMatchingDiTHead, MoTFlowMatchingHead
39
+
40
+ ACTION_DATASET_NAMES = []
41
+
42
+ # ----------------------------- Print Customization -----------------------------
43
+ from colorama import init, Fore, Style
44
+ from datetime import datetime
45
+
46
+ # Initialize colorama
47
+ init(autoreset=True)
48
+
49
+ class CustomPrinter:
50
+ """Custom colored printer."""
51
+
52
+ # Define message type configuration
53
+ TYPE_CONFIG = {
54
+ 'normal': {
55
+ 'color': Fore.WHITE,
56
+ 'icon': '',
57
+ 'prefix': '',
58
+ 'style': Style.NORMAL
59
+ },
60
+ 'important': {
61
+ 'color': Fore.CYAN,
62
+ 'icon': '💡',
63
+ 'prefix': 'IMPORTANT',
64
+ 'style': Style.BRIGHT
65
+ }
66
+ }
67
+
68
+ @classmethod
69
+ def print(cls, message, msg_type='normal', show_time=True, show_icon=True, end='\n'):
70
+ """
71
+ Custom print function.
72
+
73
+ Args:
74
+ message: The message content to print
75
+ msg_type: Message type ('normal' or 'important'; unrecognized types fall back to 'normal')
76
+ show_time: Whether to display a timestamp
77
+ show_icon: Whether to display the icon
78
+ end: Line terminator
79
+ """
80
+ # Get configuration for the message type
81
+ config = cls.TYPE_CONFIG.get(msg_type, cls.TYPE_CONFIG['normal'])
82
+
83
+ # Build prefix parts
84
+ prefix_parts = []
85
+
86
+ # Add timestamp
87
+ if show_time:
88
+ timestamp = datetime.now().strftime('%H:%M:%S')
89
+ prefix_parts.append(f"[{timestamp}]")
90
+
91
+ # Add icon and prefix text
92
+ icon_text = f"{config['icon']} " if show_icon else ""
93
+ prefix_parts.append(f"{icon_text}{config['prefix']}")
94
+
95
+ if config['prefix'] == '':
96
+ full_message = message
97
+ else:
98
+ # Combine prefix parts
99
+ prefix = " ".join(prefix_parts)
100
+
101
+ # Construct full message
102
+ full_message = f"{prefix}: {message}"
103
+
104
+ # Apply color and style and print
105
+ formatted_message = f"{config['style']}{config['color']}{full_message}"
106
+ print(formatted_message, end=end)
107
+
108
+ @classmethod
109
+ def normal(cls, message, **kwargs):
110
+ """Convenience: normal-level print."""
111
+ cls.print(message, 'normal', **kwargs)
112
+
113
+ @classmethod
114
+ def important(cls, message, **kwargs):
115
+ """Convenience: important-level print."""
116
+ cls.print(message, 'important', **kwargs)
117
+
118
+ def important(message, **kwargs):
119
+ CustomPrinter.important(message, **kwargs)
120
+
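A couple of illustrative calls (the messages are made up; formatting follows TYPE_CONFIG above):

important("Loading flow-matching action head")              # "[HH:MM:SS] 💡 IMPORTANT: ..." in bright cyan
CustomPrinter.normal("plain status line", show_time=False)  # 'normal' has no prefix, so the message prints unchanged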
121
+ # -------------------------------------------------------------
122
+
123
+
124
+ def create_sinusoidal_pos_embedding(
125
+ time: torch.Tensor,
126
+ dimension: int,
127
+ min_period: float = 4e-3,
128
+ max_period: float = 4.0,
129
+ device="cpu",
130
+ ) -> torch.Tensor:
131
+ """
132
+ Computes sine-cosine positional embedding vectors for scalar positions (diffusion timesteps).
133
+
134
+ Args:
135
+ time: Tensor of shape (batch_size,) containing timestep values
136
+ dimension: Embedding dimension (must be even)
137
+ min_period: Minimum period for sinusoidal encoding
138
+ max_period: Maximum period for sinusoidal encoding
139
+ device: Device to create tensors on
140
+
141
+ Returns:
142
+ Positional embeddings of shape (batch_size, dimension)
143
+ """
144
+ if dimension % 2 != 0:
145
+ raise ValueError(f"dimension ({dimension}) must be divisible by 2")
146
+
147
+ if time.ndim != 1:
148
+ raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
149
+
150
+ fraction = torch.linspace(0.0, 1.0, dimension // 2, device=device)
151
+ period = min_period * (max_period / min_period) ** fraction
152
+
153
+ scaling_factor = 1.0 / period * 2 * math.pi
154
+ sin_input = scaling_factor[None, :] * time[:, None]
155
+ pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
156
+ return pos_emb
157
+
158
+
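A minimal, self-contained sketch of the expected shapes (the batch size and dimension below are illustrative):

import torch

timesteps = torch.rand(4)                                     # (batch_size,) flow-matching timesteps in [0, 1]
emb = create_sinusoidal_pos_embedding(timesteps, dimension=256)
assert emb.shape == (4, 256)                                  # sin half concatenated with cos half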
159
+ class ContrastiveEncoder(nn.Module):
160
+ """
161
+ MLP projector for Contrastive Reinforcement Learning (CRL) embeddings.
162
+
163
+ Projects hidden states to a shared latent space for contrastive learning,
164
+ with L2 normalization for stable similarity computation.
165
+
166
+ Architecture: N-layer MLP with LayerNorm and Swish activation,
167
+ followed by a cold-initialized output projection.
168
+ [Linear -> LayerNorm -> Swish] x N -> Linear (cold init)
169
+
170
+ Matches stable_contrastive_rl's Q network structure (default: 4 hidden layers).
171
+
172
+ Args:
173
+ input_dim: Dimension of input hidden states
174
+ output_dim: Dimension of output embeddings (default: 256)
175
+ hidden_dim: Dimension of hidden layers (default: 1024)
176
+ num_layers: Number of hidden layers (default: 4)
177
+ repr_norm: Whether to L2-normalize outputs (default: False)
178
+ init_w: Small value for last layer weight initialization for cold init (default: 1e-12)
179
+ """
180
+ def __init__(
181
+ self,
182
+ input_dim: int,
183
+ output_dim: int = 256,
184
+ hidden_dim: int = 1024,
185
+ num_layers: int = 4,
186
+ repr_norm: bool = False,
187
+ init_w: float = 1e-12,
188
+ ):
189
+ super().__init__()
190
+ self.num_layers = num_layers
191
+ self.repr_norm = repr_norm
192
+
193
+ # Build hidden layers with LayerNorm
194
+ self.hidden_layers = nn.ModuleList()
195
+ self.layer_norms = nn.ModuleList()
196
+
197
+ for i in range(num_layers):
198
+ in_dim = input_dim if i == 0 else hidden_dim
199
+ self.hidden_layers.append(nn.Linear(in_dim, hidden_dim))
200
+ self.layer_norms.append(nn.LayerNorm(hidden_dim))
201
+
202
+ # Output projection layer with cold initialization
203
+ self.output_proj = nn.Linear(hidden_dim, output_dim)
204
+ self.output_proj.weight.data.uniform_(-init_w, init_w)
205
+ self.output_proj.bias.data.fill_(0)
206
+
207
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
208
+ """
209
+ Project input to L2-normalized embedding space.
210
+
211
+ Args:
212
+ x: Input tensor of shape (batch_size, input_dim)
213
+
214
+ Returns:
215
+ L2-normalized embeddings of shape (batch_size, output_dim)
216
+ """
217
+ # Pass through hidden layers
218
+ for fc, norm in zip(self.hidden_layers, self.layer_norms):
219
+ x = fc(x)
220
+ x = norm(x)
221
+ x = F.silu(x)
222
+
223
+ # Output projection
224
+ x = self.output_proj(x)
225
+
226
+ # Optional L2 normalization
227
+ if self.repr_norm:
228
+ x = F.normalize(x, dim=-1)
229
+
230
+ return x
231
+
232
+
233
+
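The CRL objective itself is not computed in this file (forward below returns crl_loss=None), but a minimal sketch of how paired goal/action features could be contrasted with two of these encoders, assuming a CLIP-style symmetric InfoNCE objective and illustrative sizes, might look like this:

import math
import torch
import torch.nn.functional as F

hidden_size, batch = 2048, 8                                  # illustrative sizes
goal_encoder = ContrastiveEncoder(hidden_size, output_dim=256)
action_encoder = ContrastiveEncoder(hidden_size, output_dim=256)
logit_scale = torch.tensor(math.log(1 / 0.2))                 # log-temperature, as in PRTS_Qwen3VL.__init__

goal_h = torch.randn(batch, hidden_size)                      # pooled goal hidden states (placeholder)
action_h = torch.randn(batch, hidden_size)                    # pooled action hidden states (placeholder)
z_g = F.normalize(goal_encoder(goal_h), dim=-1)
z_a = F.normalize(action_encoder(action_h), dim=-1)

logits = logit_scale.exp() * z_g @ z_a.t()                    # (batch, batch) similarity matrix
targets = torch.arange(batch)
crl_loss = 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))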
234
+ @dataclass
235
+ class PRTS_Qwen3VL_ModelOutputWithPast(ModelOutput):
236
+ """
237
+ Output class for PRTS model based on Qwen3-VL.
238
+
239
+ Args:
240
+ loss: Combined total loss
241
+ flow_loss: Flow matching loss for action prediction
242
+ cross_entropy_loss: Standard language modeling loss
243
+ crl_loss: Contrastive Reinforcement Learning loss for goal-action alignment
244
+ logits: Language model logits
245
+ past_key_values: Cached key-value states
246
+ hidden_states: Hidden states from all layers (if output_hidden_states=True)
247
+ attentions: Attention weights (if output_attentions=True)
248
+ rope_deltas: RoPE position delta information
249
+ channel_loss_dict: Per-dataset loss values for logging
250
+ channel_loss_count_dict: Per-dataset token counts for loss normalization
251
+ """
252
+ loss: Optional[torch.FloatTensor] = None
253
+ flow_loss: Optional[torch.FloatTensor] = None
254
+ cross_entropy_loss: Optional[torch.FloatTensor] = None
255
+ crl_loss: Optional[torch.FloatTensor] = None
256
+ logits: Optional[torch.FloatTensor] = None
257
+ past_key_values: Optional[List[torch.FloatTensor]] = None
258
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
259
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
260
+ rope_deltas: Optional[torch.LongTensor] = None
261
+
262
+ crl_num_samples: Optional[torch.LongTensor] = None
263
+ channel_loss_dict: Optional[dict] = None
264
+ channel_loss_count_dict: Optional[dict] = None
265
+
266
+
267
+ class PRTS_Qwen3VL(Qwen3VLForConditionalGeneration):
268
+ """
269
+ Vision-Language-Action model based on Qwen3-VL.
270
+
271
+ This model extends Qwen3-VL to support:
272
+ 1. Proprioceptive state embedding and prediction
273
+ 2. Sub-task description generation (language format)
274
+ 3. Action chunk prediction via flow matching (continuous actions)
275
+ 4. Optional discrete action tokenization (fast mode)
276
+
277
+ The model uses a flow matching approach for continuous action prediction, with a DiT
278
+ (Diffusion Transformer) action head that cross-attends to VLM hidden states.
279
+ """
280
+ config: PRTS_FlowMatchingConfig_Qwen3VL
281
+
282
+ _tied_weights_keys = ["lm_head.weight"]
283
+ _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
284
+
285
+ def __init__(
286
+ self,
287
+ config: PRTS_FlowMatchingConfig_Qwen3VL,
288
+ ):
289
+ """
290
+ Initialize the PRTS Qwen3-VL model for action processing.
291
+
292
+ Args:
293
+ config: Model configuration; the relevant fields include:
294
+ config.use_fast_action_tokenizer (bool): whether to use the FAST tokenizer for discrete actions
295
+ config.flow_matching_action_loss_weight (float): weight for the flow matching action loss
296
+ """
297
+ super().__init__(config)
298
+
299
+ # The parent class initializes:
300
+ # - self.visual: Qwen3VLVisionModel
301
+ # - self.language_model: Qwen3VLTextModel
302
+ # - self.lm_head: Language model head
303
+ # - self.rope_deltas: Cached rope deltas
304
+ # We keep these and add PRTS-specific components
305
+
306
+ # PRTS-specific parameters
307
+ self.action_dim = config.max_action_dim
308
+ self.use_fast_tokenizer = config.use_fast_action_tokenizer
309
+ self.flow_matching_action_loss_weight = config.flow_matching_action_loss_weight
310
+
311
+ # Loss functions
312
+ self.loss_fct = CrossEntropyLoss(reduction="none")
313
+ self.loss_mse = MSELoss(reduction="none")
314
+
315
+ # DiT-based flow matching action head: standard (+ AlternateVLDiT) or pi0.5 KV expert
316
+ self.use_mot_action_expert = config.dit_action_head_config.get(
317
+ "use_mot_action_expert", False
318
+ )
319
+ if config.flow_matching_action_loss_weight > 0.:
320
+ if self.use_mot_action_expert:
321
+ self.dit_action_head = MoTFlowMatchingHead(
322
+ action_dim=self.action_dim,
323
+ action_chunk_size=config.action_chunk_size,
324
+ vlm_config=config.text_config,
325
+ num_inference_timesteps=config.num_denoise_steps,
326
+ config=config.dit_action_head_config,
327
+ )
328
+ else:
329
+ self.dit_action_head = FlowMatchingDiTHead(
330
+ action_dim=self.action_dim,
331
+ action_chunk_size=config.action_chunk_size,
332
+ cross_attention_dim=config.text_config.hidden_size,
333
+ num_inference_timesteps=config.num_denoise_steps,
334
+ config=config.dit_action_head_config,
335
+ )
336
+
337
+ # CRL (Contrastive Reinforcement Learning) components
338
+ if config.crl_loss_weight > 0.:
339
+ hidden_size = config.text_config.hidden_size
340
+ # Current encoders (trainable)
341
+ self.crl_action_encoder = ContrastiveEncoder(
342
+ input_dim=hidden_size,
343
+ output_dim=config.crl_embed_dim,
344
+ init_w=config.crl_encoder_init_w,
345
+ repr_norm=config.crl_repr_norm,
346
+ )
347
+ self.crl_goal_encoder = ContrastiveEncoder(
348
+ input_dim=hidden_size,
349
+ output_dim=config.crl_embed_dim,
350
+ init_w=config.crl_encoder_init_w,
351
+ repr_norm=config.crl_repr_norm,
352
+ )
353
+ # Learnable temperature (log-space for numerical stability, CLIP recipe).
354
+ self.crl_logit_scale = nn.Parameter(
355
+ torch.ones([], requires_grad=True) * math.log(1 / 0.2)
356
+ )
357
+
358
+ # Initialize weights
359
+ self.post_init()
360
+
361
+ # Print parameter counts
362
+ visual_params = sum(p.numel() for p in self.visual.parameters())
363
+ language_params = sum(p.numel() for p in self.language_model.parameters())
364
+ model_params = visual_params + language_params
365
+ important(f"Backbone VLM (visual + language_model) parameters: {model_params / 1e6:.2f}M")
366
+ important(f"Flow Matching Loss coefficient: {self.flow_matching_action_loss_weight}")
367
+
368
+ if config.flow_matching_action_loss_weight > 0.:
369
+ dit_params = sum(p.numel() for p in self.dit_action_head.parameters())
370
+ # Get the inner model type name for logging
371
+ if hasattr(self.dit_action_head, 'dit'):
372
+ dit_head_type = type(self.dit_action_head.dit).__name__
373
+ else:
374
+ dit_head_type = type(self.dit_action_head).__name__
375
+ important(f"DiT Action Head ({dit_head_type}) parameters: {dit_params / 1e6:.2f}M")
376
+
377
+ if config.crl_loss_weight > 0.:
378
+ crl_params = sum(p.numel() for p in self.crl_action_encoder.parameters())
379
+ crl_params += sum(p.numel() for p in self.crl_goal_encoder.parameters())
380
+ important(f"CRL Encoders (action + goal) parameters: {crl_params / 1e6:.2f}M")
381
+ important(f"CRL Loss coefficient: {config.crl_loss_weight}")
382
+ important(f"CRL Encoder init_w: {config.crl_encoder_init_w}")
383
+ important(f"CRL Repr Norm: {config.crl_repr_norm}")
384
+
385
+ self.fast_action_token_start_idx = 200000
386
+ self.use_multi_positive = True
387
+
388
+ def get_input_embeddings(self):
389
+ return self.language_model.get_input_embeddings()
390
+
391
+ def set_input_embeddings(self, value):
392
+ self.language_model.set_input_embeddings(value)
393
+
394
+ def set_decoder(self, decoder):
395
+ self.language_model = decoder
396
+
397
+ def get_decoder(self):
398
+ return self.language_model
399
+
400
+ def get_output_embeddings(self):
401
+ return self.lm_head
402
+
403
+ def set_output_embeddings(self, new_embeddings):
404
+ self.lm_head = new_embeddings
405
+
406
+ def to_float32_flow_matching_head(self):
407
+ """Convert flow matching heads to float32 for numerical stability."""
408
+ if hasattr(self, 'dit_action_head'):
409
+ self.dit_action_head = self.dit_action_head.to(dtype=torch.float32)
410
+
411
+ def set_fast_action_info(self, action_mapper, fast_action_token_start_idx):
412
+ """Set information for fast (discrete) action tokenization."""
413
+ self.action_mapper = action_mapper
414
+ self.fast_action_token_start_idx = fast_action_token_start_idx
415
+
416
+ def get_placeholder_mask_with_special_token(
417
+ self,
418
+ input_ids: torch.LongTensor,
419
+ inputs_embeds: torch.FloatTensor,
420
+ special_features: torch.FloatTensor,
421
+ special_pad_token_id: int,
422
+ ):
423
+ """
424
+ Get placeholder mask for a specific special token (e.g., state tokens).
425
+
426
+ Similar to get_placeholder_mask but for custom special tokens beyond image/video.
427
+ """
428
+ if input_ids is None:
429
+ special_mask = inputs_embeds == self.get_input_embeddings()(
430
+ torch.tensor(special_pad_token_id, dtype=torch.long, device=inputs_embeds.device)
431
+ )
432
+ special_mask = special_mask.all(-1)
433
+ else:
434
+ special_mask = input_ids == special_pad_token_id
435
+
436
+ n_special_tokens = special_mask.sum()
437
+ special_mask = special_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
438
+ if special_features is not None and inputs_embeds[special_mask].numel() != special_features.numel():
439
+ raise ValueError(
440
+ f"Features and tokens do not match: tokens: {n_special_tokens}, features {special_features.shape[0]}"
441
+ )
442
+
443
+ return special_mask
444
+
445
+ def forward(
446
+ self,
447
+ input_ids: Optional[torch.LongTensor] = None,
448
+ attention_mask: Optional[torch.Tensor] = None,
449
+ position_ids: Optional[torch.LongTensor] = None,
450
+ past_key_values: Optional[Cache] = None,
451
+ inputs_embeds: Optional[torch.FloatTensor] = None,
452
+ labels: Optional[torch.LongTensor] = None,
453
+ # use_cache: Optional[bool] = None,
454
+ # output_attentions: Optional[bool] = None,
455
+ # output_hidden_states: Optional[bool] = None,
456
+ # return_dict: Optional[bool] = None,
457
+ pixel_values: Optional[torch.Tensor] = None,
458
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
459
+ image_grid_thw: Optional[torch.LongTensor] = None,
460
+ video_grid_thw: Optional[torch.LongTensor] = None,
461
+ # rope_deltas: Optional[torch.LongTensor] = None,
462
+ cache_position: Optional[torch.LongTensor] = None,
463
+ logits_to_keep: Union[int, torch.Tensor] = 0,
464
+ actions: Optional[torch.Tensor] = None,
465
+ action_is_pad: torch.Tensor | None = None,
466
+ action_dof_mask: Optional[torch.Tensor] = None,
467
+ dataset_names: Optional[List[str]] = None,
468
+ **kwargs: Unpack[TransformersKwargs],
469
+ ) -> Union[tuple, PRTS_Qwen3VL_ModelOutputWithPast]:
470
+ """
471
+ Forward pass for PRTS_Qwen3VL model.
472
+
473
+ This extends Qwen3VLForConditionalGeneration.forward with:
474
+ - State embedding injection
475
+ - Action chunk flow matching
476
+ - DeepStack visual feature handling
477
+ """
478
+ if (input_ids is None) ^ (inputs_embeds is not None):
479
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
480
+
481
+
482
+ # 1. Prepare input embeddings
483
+ if inputs_embeds is None:
484
+ inputs_embeds = self.get_input_embeddings()(input_ids)
485
+
486
+ image_mask = None
487
+ video_mask = None
488
+
489
+ # 2. Process images with deepstack features
490
+ deepstack_image_embeds = None
491
+ if pixel_values is not None:
492
+ image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw, image_max_seqlen=kwargs.get('image_max_seqlen'))  # .get avoids a KeyError when the kwarg is absent, matching embed_prefix
493
+ image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
494
+ image_mask, _ = self.get_placeholder_mask(
495
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
496
+ )
497
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
498
+
499
+ # 3. Process videos with deepstack features
500
+ deepstack_video_embeds = None
501
+ if pixel_values_videos is not None:
502
+ video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
503
+ video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
504
+ _, video_mask = self.get_placeholder_mask(
505
+ input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
506
+ )
507
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
508
+
509
+ # 4. Aggregate deepstack visual features
510
+ visual_pos_masks = None
511
+ deepstack_visual_embeds = None
512
+ if image_mask is not None and video_mask is not None:
513
+ # aggregate visual_pos_masks and deepstack_visual_embeds
514
+ image_mask = image_mask[..., 0]
515
+ video_mask = video_mask[..., 0]
516
+ visual_pos_masks = image_mask | video_mask
517
+ deepstack_visual_embeds = []
518
+ image_mask_joint = image_mask[visual_pos_masks]
519
+ video_mask_joint = video_mask[visual_pos_masks]
520
+ for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
521
+ embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
522
+ embed_joint[image_mask_joint, :] = img_embed
523
+ embed_joint[video_mask_joint, :] = vid_embed
524
+ deepstack_visual_embeds.append(embed_joint)
525
+ elif image_mask is not None:
526
+ image_mask = image_mask[..., 0]
527
+ visual_pos_masks = image_mask
528
+ deepstack_visual_embeds = deepstack_image_embeds
529
+ elif video_mask is not None:
530
+ video_mask = video_mask[..., 0]
531
+ visual_pos_masks = video_mask
532
+ deepstack_visual_embeds = deepstack_video_embeds
533
+
534
+ if attention_mask is not None:
535
+ attention_mask = attention_mask.to(inputs_embeds.device)
536
+
537
+ # 7. Calculate position IDs using Qwen3VL's rope index
538
+ if position_ids is None:
539
+ attention_mask_tensor = (
540
+ attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
541
+ )
542
+ if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
543
+ attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
544
+ if attention_mask_tensor.dtype.is_floating_point:
545
+ attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
546
+ attention_mask_tensor = (1.0 - attention_mask_tensor).int()
547
+
548
+ prefill_compiled_stage = is_torchdynamo_compiling() and (
549
+ (input_ids is not None and input_ids.shape[1] != 1)
550
+ or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
551
+ )
552
+ prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
553
+ (cache_position is not None and cache_position[0] == 0)
554
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
555
+ )
556
+ if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
557
+ position_ids, rope_deltas = self.get_rope_index(
558
+ input_ids,
559
+ image_grid_thw,
560
+ video_grid_thw,
561
+ attention_mask=attention_mask_tensor,
562
+ )
563
+ self.rope_deltas = rope_deltas
564
+ else:
565
+ batch_size, seq_length, _ = inputs_embeds.shape
566
+ delta = (
567
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
568
+ if cache_position is not None
569
+ else 0
570
+ )
571
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
572
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
573
+ if cache_position is not None: # otherwise `deltas` is an int `0`
574
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
575
+ position_ids = position_ids.add(delta)
576
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
577
+
578
+ _lm_extra_kwargs: dict = {}
579
+
580
+ _use_cache = (
581
+ self.use_mot_action_expert
582
+ and self.flow_matching_action_loss_weight > 0.
583
+ and actions is not None
584
+ )
585
+
586
+ vlm_outputs = self.language_model(
587
+ input_ids=None,
588
+ position_ids=position_ids,
589
+ attention_mask=attention_mask,
590
+ past_key_values=past_key_values,
591
+ inputs_embeds=inputs_embeds,
592
+ use_cache=_use_cache,
593
+ cache_position=cache_position,
594
+ visual_pos_masks=visual_pos_masks,
595
+ deepstack_visual_embeds=deepstack_visual_embeds,
596
+ output_hidden_states=False,
597
+ **_lm_extra_kwargs,
598
+ **kwargs,
599
+ )
600
+
601
+ vlm_hidden_states = vlm_outputs.last_hidden_state
602
+
603
+ # 11. Run DiT action head if actions are present
604
+ dit_pred_v = None
605
+ dit_velocity = None
606
+ if actions is not None and self.flow_matching_action_loss_weight > 0:
607
+ # vlm_hidden_states shape: bs, seq_length, hidden_size
608
+ actions_for_dit = actions.to(vlm_hidden_states.device, dtype=vlm_hidden_states.dtype)
609
+ dof_mask_for_dit = action_dof_mask.to(vlm_hidden_states.device, dtype=vlm_hidden_states.dtype) if action_dof_mask is not None else None
610
+ # Pass attention_mask so DiT cross-attention ignores padding tokens
611
+ dit_encoder_attention_mask = attention_mask.bool() if attention_mask is not None else None
612
+
613
+ if self.use_mot_action_expert and vlm_outputs.past_key_values is not None:
614
+ dit_pred_v, dit_velocity = self.dit_action_head(
615
+ vlm_outputs.past_key_values,
616
+ actions_for_dit,
617
+ dof_mask_for_dit,
618
+ encoder_attention_mask=dit_encoder_attention_mask,
619
+ )
620
+ else:
621
+ # Standard: pass single (last-layer) VLM hidden states
622
+ dit_image_mask = visual_pos_masks.bool() if visual_pos_masks is not None else None
623
+ dit_pred_v, dit_velocity = self.dit_action_head(
624
+ vlm_hidden_states, actions_for_dit, dof_mask_for_dit,
625
+ encoder_attention_mask=dit_encoder_attention_mask,
626
+ image_mask=dit_image_mask,
627
+ )
628
+
629
+ # 12. Compute logits
630
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
631
+ logits = self.lm_head(vlm_hidden_states[:, slice_indices, :])
632
+
633
+ # 13. Compute losses
634
+ loss = None
635
+ cross_entropy_loss, flow_loss = None, None
636
+ channel_loss_dict = None
637
+ channel_loss_count_dict = None
638
+
639
+ if labels is not None:
640
+ loss = 0
641
+ action_accuracy = 0
642
+ unique_datasets_name = list(set(dataset_names)) if dataset_names is not None else []
643
+
644
+ # Compute cross-entropy loss
645
+ shift_logits = logits[..., :-1, :].float().contiguous()
646
+ shift_labels = labels[..., 1:].contiguous()
647
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
648
+ shift_labels = shift_labels.view(-1)
649
+
650
+ shift_labels = shift_labels.to(shift_logits.device)
651
+ non_ignored_mask = shift_labels != -100
652
+ _cross_entropy_loss = self.loss_fct(shift_logits, shift_labels)
653
+ cross_entropy_loss = (
654
+ _cross_entropy_loss[non_ignored_mask].mean()
655
+ if non_ignored_mask.any()
656
+ else (_cross_entropy_loss.sum() * 0.0)
657
+ )
658
+
659
+ # Add cross-entropy loss to total
660
+ if not torch.isnan(cross_entropy_loss):
661
+ loss += cross_entropy_loss
662
+ else:
663
+ with torch.no_grad():
664
+ cross_entropy_loss = cross_entropy_loss.detach()  # NaN: keep for logging only, do not add to the total loss
665
+
666
+ # Compute action token prediction accuracy (for logging)
667
+ shift_logits_for_acc = logits[..., :-1, :].contiguous()
668
+ action_preds = shift_logits_for_acc.argmax(dim=-1)
669
+ shift_labels_for_acc = labels[..., 1:].contiguous()
670
+
671
+ action_mask = (
672
+ shift_labels_for_acc >= self.fast_action_token_start_idx
673
+ )
674
+
675
+ if self.use_fast_tokenizer and action_mask.any():
676
+ correct_preds = (action_preds == shift_labels_for_acc) & action_mask
677
+ action_accuracy = (
678
+ correct_preds.sum().float() / action_mask.sum().float()
679
+ )
680
+
681
+ if channel_loss_dict is None:
682
+ channel_loss_dict = {}
683
+ channel_loss_count_dict = {}
684
+
685
+ channel_loss_dict["action_accuracy"] = action_accuracy.detach()
686
+ channel_loss_count_dict["action_accuracy"] = torch.tensor(1, device=action_accuracy.device)
687
+
688
+ # 14. Compute flow matching loss (DiT action head)
689
+ if dit_pred_v is not None and self.flow_matching_action_loss_weight > 0:
690
+ if channel_loss_dict is not None:
691
+ channel_loss_dict.update(
692
+ {
693
+ f"flow_matching/{dataset_name}": torch.tensor(0.0, device=logits.device)
694
+ for dataset_name in ACTION_DATASET_NAMES
695
+ }
696
+ )
697
+ channel_loss_count_dict.update(
698
+ {
699
+ f"flow_matching/{dataset_name}": torch.tensor(0, device=logits.device)
700
+ for dataset_name in ACTION_DATASET_NAMES
701
+ }
702
+ )
703
+ else:
704
+ channel_loss_dict = {
705
+ f"flow_matching/{dataset_name}": torch.tensor(0.0, device=logits.device)
706
+ for dataset_name in ACTION_DATASET_NAMES
707
+ }
708
+ channel_loss_count_dict = {
709
+ f"flow_matching/{dataset_name}": torch.tensor(0, device=logits.device)
710
+ for dataset_name in ACTION_DATASET_NAMES
711
+ }
712
+
713
+ # Compute flow matching loss: MSE between predicted and target velocity
714
+ _fm_loss = self.loss_mse(dit_pred_v, dit_velocity)
715
+
716
+ # Apply DOF mask (zero out invalid action dimensions)
717
+ if action_dof_mask is not None:
718
+ valid_action_dim = int(action_dof_mask[0, 0, :].sum(dim=-1).item()) # NOTE: only supports fine-tuning on data from a single embodiment
719
+ _fm_loss = _fm_loss[:, :, :valid_action_dim]
720
+
721
+ # Apply action_is_pad mask: exclude padding timesteps from loss
722
+ # action_is_pad: (B, T), True = pad timestep → should not contribute to loss
723
+ if action_is_pad is not None:
724
+ valid_timestep_mask = ~action_is_pad[:, :_fm_loss.shape[1]] # align length
725
+ _fm_loss = _fm_loss * valid_timestep_mask.unsqueeze(-1)
726
+ flow_loss = _fm_loss.sum() / (valid_timestep_mask.sum() * _fm_loss.shape[-1])
727
+ else:
728
+ flow_loss = _fm_loss.mean()
729
+
730
+ if not torch.isnan(flow_loss):
731
+ loss = loss + self.flow_matching_action_loss_weight * flow_loss if loss is not None else self.flow_matching_action_loss_weight * flow_loss
732
+ else:
733
+ with torch.no_grad():
734
+ flow_loss = flow_loss.detach()  # NaN: keep for logging only, do not add to the total loss
735
+
736
+ # Per-dataset flow matching loss logging
737
+ logging_fm_loss = _fm_loss.detach().mean(dim=(1, 2)) # Mean over chunk_size and action_dim
738
+
739
+ action_dataset_names = dataset_names if dataset_names is not None else []
740
+ unique_action_datasets = list(set(action_dataset_names))
741
+
742
+ for dataset_name_i in unique_action_datasets:
743
+ action_dataset_mask = torch.tensor(
744
+ [name == dataset_name_i for name in action_dataset_names],
745
+ device=logits.device,
746
+ )
747
+ if action_dataset_mask.any():
748
+ dataset_fm_loss = logging_fm_loss[action_dataset_mask].sum()
749
+ dataset_fm_count = action_dataset_mask.sum()
750
+
751
+ prefixed_key = f"flow_matching/{dataset_name_i}"
752
+ channel_loss_dict[prefixed_key] += dataset_fm_loss
753
+ channel_loss_count_dict[prefixed_key] += dataset_fm_count
754
+
755
+ elif self.flow_matching_action_loss_weight > 0:
756
+ # Dummy loss to keep all DiT parameters in computation graph
757
+ dummy_params = [p.sum() * 0.0 for p in self.dit_action_head.parameters() if p.requires_grad]
758
+ dummy_loss = sum(dummy_params) if len(dummy_params) > 0 else torch.tensor(0.0, device=logits.device)
759
+ loss = (loss + dummy_loss) if loss is not None else dummy_loss
760
+
761
+ return PRTS_Qwen3VL_ModelOutputWithPast(
762
+ loss=loss,
763
+ cross_entropy_loss=(
764
+ cross_entropy_loss.detach() if cross_entropy_loss is not None else None
765
+ ),
766
+ flow_loss=(
767
+ flow_loss.detach() if flow_loss is not None else None
768
+ ),
769
+ crl_loss=None,
770
+ logits=logits,
771
+ past_key_values=vlm_outputs.past_key_values,
772
+ # hidden_states=vlm_outputs.hidden_states,
773
+ # attentions=vlm_outputs.attentions,
774
+ crl_num_samples=None,
775
+ rope_deltas=self.rope_deltas,
776
+ channel_loss_dict=channel_loss_dict,
777
+ channel_loss_count_dict=channel_loss_count_dict,
778
+ )
779
+
780
+
781
+ def embed_prefix(
782
+ self,
783
+ input_ids: torch.LongTensor,
784
+ inputs_embeds: torch.FloatTensor | None = None,
785
+ pixel_values: torch.Tensor | None = None,
786
+ pixel_values_videos: torch.FloatTensor | None = None,
787
+ image_grid_thw: torch.LongTensor | None = None,
788
+ video_grid_thw: torch.LongTensor | None = None,
789
+ **kwargs,
790
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
791
+ """
792
+ Embed prefix tokens including vision, DeepStack, and (optionally) state features.
793
+
794
+ Returns:
795
+ (inputs_embeds, visual_pos_masks, deepstack_visual_embeds)
796
+ """
797
+ if inputs_embeds is None:
798
+ inputs_embeds = self.get_input_embeddings()(input_ids)
799
+
800
+ image_mask = None
801
+ video_mask = None
802
+ deepstack_image_embeds = None
803
+ deepstack_video_embeds = None
804
+
805
+ if pixel_values is not None:
806
+ image_embeds, deepstack_image_embeds = self.get_image_features(
807
+ pixel_values, image_grid_thw,
808
+ image_max_seqlen=kwargs.get('image_max_seqlen'),
809
+ )
810
+ image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
811
+ image_mask, _ = self.get_placeholder_mask(
812
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
813
+ )
814
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
815
+
816
+ if pixel_values_videos is not None:
817
+ video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
818
+ video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
819
+ _, video_mask = self.get_placeholder_mask(
820
+ input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
821
+ )
822
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
823
+
824
+ visual_pos_masks = None
825
+ deepstack_visual_embeds = None
826
+ if image_mask is not None and video_mask is not None:
827
+ image_mask = image_mask[..., 0]
828
+ video_mask = video_mask[..., 0]
829
+ visual_pos_masks = image_mask | video_mask
830
+ deepstack_visual_embeds = []
831
+ image_mask_joint = image_mask[visual_pos_masks]
832
+ video_mask_joint = video_mask[visual_pos_masks]
833
+ for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
834
+ embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
835
+ embed_joint[image_mask_joint, :] = img_embed
836
+ embed_joint[video_mask_joint, :] = vid_embed
837
+ deepstack_visual_embeds.append(embed_joint)
838
+ elif image_mask is not None:
839
+ image_mask = image_mask[..., 0]
840
+ visual_pos_masks = image_mask
841
+ deepstack_visual_embeds = deepstack_image_embeds
842
+ elif video_mask is not None:
843
+ video_mask = video_mask[..., 0]
844
+ visual_pos_masks = video_mask
845
+ deepstack_visual_embeds = deepstack_video_embeds
846
+
847
+ return inputs_embeds, visual_pos_masks, deepstack_visual_embeds
848
+
849
+ @torch.no_grad()
850
+ def sample_actions(
851
+ self,
852
+ input_ids: torch.LongTensor | None = None,
853
+ position_ids: torch.LongTensor | None = None,
854
+ attention_mask: torch.Tensor | None = None,
855
+ past_key_values: list[torch.FloatTensor] | None = None,
856
+ inputs_embeds: torch.FloatTensor | None = None,
857
+ cache_position: torch.LongTensor | None = None,
858
+ pixel_values: torch.Tensor | None = None,
859
+ pixel_values_videos: torch.FloatTensor | None = None,
860
+ image_grid_thw: torch.LongTensor | None = None,
861
+ video_grid_thw: torch.LongTensor | None = None,
862
+ action_dof_mask: Optional[torch.Tensor] = None,
863
+ **kwargs,
864
+ ) -> Tuple[torch.Tensor, Any]:
865
+ """
866
+ Sample actions using DiT-based flow matching denoising.
867
+
868
+ 1. Computes position_ids via get_rope_index
869
+ 2. Embeds the prefix (with DeepStack visual features)
870
+ 3. Runs the language model to get hidden states
871
+ 4. Uses DiT action head to denoise actions via cross-attention to VLM features
872
+
873
+ Returns:
874
+ (x_t, outputs) — denoised action trajectories and language-model outputs
875
+ """
876
+ if position_ids is None:
877
+ position_ids, _ = self.get_rope_index(
878
+ input_ids,
879
+ image_grid_thw=image_grid_thw,
880
+ video_grid_thw=video_grid_thw,
881
+ attention_mask=attention_mask,
882
+ )
883
+
884
+ visual_pos_masks = None
885
+ deepstack_visual_embeds = None
886
+ if inputs_embeds is None:
887
+ inputs_embeds, visual_pos_masks, deepstack_visual_embeds = self.embed_prefix(
888
+ input_ids,
889
+ pixel_values=pixel_values,
890
+ pixel_values_videos=pixel_values_videos,
891
+ image_grid_thw=image_grid_thw,
892
+ video_grid_thw=video_grid_thw,
893
+ **kwargs,
894
+ )
895
+
896
+ _sample_use_cache = (
897
+ self.use_mot_action_expert and self.flow_matching_action_loss_weight > 0
898
+ )
899
+ outputs = self.language_model(
900
+ input_ids=None,
901
+ position_ids=position_ids,
902
+ attention_mask=attention_mask,
903
+ past_key_values=past_key_values,
904
+ inputs_embeds=inputs_embeds,
905
+ use_cache=_sample_use_cache,
906
+ cache_position=cache_position,
907
+ visual_pos_masks=visual_pos_masks,
908
+ deepstack_visual_embeds=deepstack_visual_embeds,
909
+ output_hidden_states=False,
910
+ )
911
+
912
+ vlm_hidden_states = outputs.last_hidden_state
913
+ dit_encoder_attention_mask = attention_mask.bool() if attention_mask is not None else None
914
+
915
+ if self.use_mot_action_expert and outputs.past_key_values is not None:
916
+ x_t = self.dit_action_head.predict_action(
917
+ outputs.past_key_values,
918
+ action_dof_mask,
919
+ encoder_attention_mask=dit_encoder_attention_mask,
920
+ )
921
+ else:
922
+ dit_image_mask = visual_pos_masks.bool() if visual_pos_masks is not None else None
923
+ x_t = self.dit_action_head.predict_action(
924
+ vlm_hidden_states, action_dof_mask,
925
+ encoder_attention_mask=dit_encoder_attention_mask,
926
+ image_mask=dit_image_mask,
927
+ )
928
+
929
+ return x_t, outputs
930
+
931
+
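For orientation, a hedged sketch of how sample_actions might be called at inference time; the checkpoint path, the processor behaviour, and the input names are assumptions, since the preprocessing code is not part of this file:

# Hypothetical inference sketch; repo path and processor usage are assumptions.
import torch
from PIL import Image
from transformers import AutoProcessor

repo = "path/to/this/checkpoint"                              # placeholder
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = PRTS_Qwen3VL.from_pretrained(repo, trust_remote_code=True).eval()

image = Image.new("RGB", (448, 448))                          # dummy observation
inputs = processor(text=["pick up the red block"], images=[image], return_tensors="pt")

with torch.no_grad():
    actions, _ = model.sample_actions(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pixel_values=inputs.get("pixel_values"),
        image_grid_thw=inputs.get("image_grid_thw"),
    )
# actions: (batch, action_chunk_size, max_action_dim) denoised action chunk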
932
+ PRTS_Qwen3VL.register_for_auto_class()
933
+
934
+
935
+ __all__ = ["PRTS_Qwen3VL", "PRTS_Qwen3VL_ModelOutputWithPast"]
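For context on the padding mask used in the flow-matching loss inside PRTS_Qwen3VL.forward above, a small self-contained numerical sketch (shapes are illustrative):

import torch

B, T, D = 2, 4, 3                                             # batch, action chunk length, action dim
pred_v = torch.randn(B, T, D)                                 # predicted velocity
target_v = torch.randn(B, T, D)                               # flow-matching target velocity
action_is_pad = torch.tensor([[False, False, True, True],
                              [False, False, False, True]])   # True = padded timestep

per_elem = (pred_v - target_v) ** 2                           # same as MSELoss(reduction="none")
valid = ~action_is_pad                                        # (B, T) valid-timestep mask
masked = per_elem * valid.unsqueeze(-1)                       # zero out padded timesteps
flow_loss = masked.sum() / (valid.sum() * D)                  # mean over valid (timestep, dim) entries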
modeling_qwen3_vl.py ADDED
@@ -0,0 +1,1645 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/qwen3_vl/modular_qwen3_vl.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_qwen3_vl.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+ from dataclasses import dataclass
23
+ from typing import Any, Callable, Optional, Union
24
+
25
+ import torch
26
+ import torch.distributed as dist
27
+ import torch.nn as nn
28
+ import torch.nn.functional as F
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.integrations import use_kernel_forward_from_hub
34
+ from transformers.masking_utils import create_causal_mask
35
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
36
+ from transformers.modeling_layers import GradientCheckpointingLayer
37
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
38
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
39
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
40
+ from transformers.processing_utils import Unpack
41
+ from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
42
+ from transformers.utils.deprecation import deprecate_kwarg
43
+ from transformers.utils.generic import check_model_inputs
44
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig, Qwen3VLTextConfig, Qwen3VLVisionConfig
45
+ # Import at the top of the file
46
+
47
+ try:
48
+ from qwen_rope_kernel_2 import fused_qwen_rope as fused_qwen_rope_v2
49
+ HAS_QWEN_ROPE_V2 = True
50
+ except ImportError:
51
+ print("No qwen_rope_kernel_2 found")
52
+ HAS_QWEN_ROPE_V2 = False
53
+
54
+ try:
55
+ from fused_rmsnorm import RMSNormModelFunction as _FUSED_RMSFUNC
56
+ HAS_FUSED_RMSNORM = True
57
+ except ImportError:
58
+ print("No fused_rmsnorm found")
59
+ HAS_FUSED_RMSNORM = False
60
+
61
+
62
+ class Qwen3VLVisionMLP(nn.Module):
63
+ def __init__(self, config):
64
+ super().__init__()
65
+ self.hidden_size = config.hidden_size
66
+ self.intermediate_size = config.intermediate_size
67
+ self.linear_fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
68
+ self.linear_fc2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
69
+ self.act_fn = ACT2FN[config.hidden_act]
70
+
71
+ def forward(self, hidden_state):
72
+ return self.linear_fc2(self.act_fn(self.linear_fc1(hidden_state)))
73
+
74
+
75
+ class Qwen3VLVisionPatchEmbed(nn.Module):
76
+ def __init__(self, config) -> None:
77
+ super().__init__()
78
+ self.patch_size = config.patch_size
79
+ self.temporal_patch_size = config.temporal_patch_size
80
+ self.in_channels = config.in_channels
81
+ self.embed_dim = config.hidden_size
82
+
83
+ kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
84
+ self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True)
85
+
86
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
87
+ target_dtype = self.proj.weight.dtype
88
+ hidden_states = hidden_states.view(
89
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
90
+ )
91
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
92
+ return hidden_states
93
+
94
+
95
+ class Qwen3VLVisionRotaryEmbedding(nn.Module):
96
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
97
+
98
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
99
+ super().__init__()
100
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
101
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
102
+
103
+ def forward(self, seqlen: int) -> torch.Tensor:
104
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
105
+ freqs = torch.outer(seq, self.inv_freq)
106
+ return freqs
107
+
108
+
109
+ class Qwen3VLVisionPatchMerger(nn.Module):
110
+ def __init__(self, config: Qwen3VLVisionConfig, use_postshuffle_norm=False) -> None:
111
+ super().__init__()
112
+ self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
113
+ self.use_postshuffle_norm = use_postshuffle_norm
114
+ self.norm = nn.LayerNorm(self.hidden_size if use_postshuffle_norm else config.hidden_size, eps=1e-6)
115
+ self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size)
116
+ self.act_fn = nn.GELU()
117
+ self.linear_fc2 = nn.Linear(self.hidden_size, config.out_hidden_size)
118
+
119
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
120
+ x = self.norm(x.view(-1, self.hidden_size) if self.use_postshuffle_norm else x).view(-1, self.hidden_size)
121
+ x = self.linear_fc2(self.act_fn(self.linear_fc1(x)))
122
+ return x
123
+
124
+
125
+ def rotate_half(x):
126
+ """Rotates half the hidden dims of the input."""
127
+ x1 = x[..., : x.shape[-1] // 2]
128
+ x2 = x[..., x.shape[-1] // 2 :]
129
+ return torch.cat((-x2, x1), dim=-1)
130
+
131
+
132
+ def apply_rotary_pos_emb_vision(
133
+ q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
134
+ ) -> tuple[torch.Tensor, torch.Tensor]:
135
+
136
+ if HAS_QWEN_ROPE_V2 and q.is_cuda and q.dtype == torch.bfloat16 and q.shape[-1] in (64, 128):
137
+ # qwen_rope_kernel_2 handles (S, D) cos/sin for (S, H, D) input naturally.
138
+ # The kernel REQUIRES cos/sin to be 2D [S, D] if input is 3D [S, H, D].
139
+ # It DOES NOT support 3D [S, 1, D] for cos/sin.
140
+
141
+ if cos.dtype != torch.float32:
142
+ cos = cos.to(torch.float32)
143
+ if sin.dtype != torch.float32:
144
+ sin = sin.to(torch.float32)
145
+
146
+ # Proactively squeeze [S, 1, D] -> [S, D] to satisfy kernel requirements
147
+ # This is a view operation, zero memory copy overhead.
148
+ if cos.ndim == 3 and cos.shape[1] == 1:
149
+ cos = cos.squeeze(1)
150
+ sin = sin.squeeze(1)
151
+
152
+ return fused_qwen_rope_v2(q, cos, sin), fused_qwen_rope_v2(k, cos, sin)
153
+
154
+ orig_q_dtype = q.dtype
155
+ orig_k_dtype = k.dtype
156
+ q, k = q.float(), k.float()
157
+ if cos.ndim == 2:
158
+ cos = cos.unsqueeze(-2)
159
+ sin = sin.unsqueeze(-2)
160
+ if cos.dtype != torch.float32:
161
+ cos = cos.to(torch.float32)
162
+ if sin.dtype != torch.float32:
163
+ sin = sin.to(torch.float32)
164
+ q_embed = (q * cos) + (rotate_half(q) * sin)
165
+ k_embed = (k * cos) + (rotate_half(k) * sin)
166
+ return q_embed.to(orig_q_dtype), k_embed.to(orig_k_dtype)
167
+
168
+
169
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
170
+ """
171
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
172
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
173
+ """
174
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
175
+ if n_rep == 1:
176
+ return hidden_states
177
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
178
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
179
+
180
+
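A one-line check of the equivalence stated in the docstring (sizes are arbitrary):

import torch

x = torch.randn(1, 2, 5, 4)                                   # (batch, num_key_value_heads, seq_len, head_dim)
assert torch.equal(repeat_kv(x, 3), torch.repeat_interleave(x, repeats=3, dim=1))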
181
+ def eager_attention_forward(
182
+ module: nn.Module,
183
+ query: torch.Tensor,
184
+ key: torch.Tensor,
185
+ value: torch.Tensor,
186
+ attention_mask: Optional[torch.Tensor],
187
+ scaling: float,
188
+ dropout: float = 0.0,
189
+ **kwargs: Unpack[TransformersKwargs],
190
+ ):
191
+ key_states = repeat_kv(key, module.num_key_value_groups)
192
+ value_states = repeat_kv(value, module.num_key_value_groups)
193
+
194
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
195
+ if attention_mask is not None:
196
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
197
+ attn_weights = attn_weights + causal_mask
198
+
199
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
200
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
201
+ attn_output = torch.matmul(attn_weights, value_states)
202
+ attn_output = attn_output.transpose(1, 2).contiguous()
203
+
204
+ return attn_output, attn_weights
205
+
206
+
207
+ class Qwen3VLVisionAttention(nn.Module):
208
+ def __init__(self, config: Qwen3VLVisionConfig) -> None:
209
+ super().__init__()
210
+ self.dim = config.hidden_size
211
+ self.num_heads = config.num_heads
212
+ self.head_dim = self.dim // self.num_heads
213
+ self.num_key_value_groups = 1 # needed for eager attention
214
+ self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
215
+ self.proj = nn.Linear(self.dim, self.dim)
216
+ self.scaling = self.head_dim**-0.5
217
+ self.config = config
218
+ self.attention_dropout = 0.0
219
+ self.is_causal = False
220
+
221
+ def forward(
222
+ self,
223
+ hidden_states: torch.Tensor,
224
+ cu_seqlens: torch.Tensor,
225
+ rotary_pos_emb: Optional[torch.Tensor] = None,
226
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
227
+ **kwargs,
228
+ ) -> torch.Tensor:
229
+ seq_length = hidden_states.shape[0]
230
+ query_states, key_states, value_states = (
231
+ self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
232
+ )
233
+ cos, sin = position_embeddings
234
+ query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
235
+
236
+ query_states = query_states.transpose(0, 1).unsqueeze(0)
237
+ key_states = key_states.transpose(0, 1).unsqueeze(0)
238
+ value_states = value_states.transpose(0, 1).unsqueeze(0)
239
+
240
+ attention_interface: Callable = eager_attention_forward
241
+ if self.config._attn_implementation != "eager":
242
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
243
+
244
+ if self.config._attn_implementation in ["flash_attention_2", "flash_attention_3"]:
245
+ # Flash Attention 2: Use cu_seqlens for variable length attention
246
+ if "image_max_seqlen" in kwargs and kwargs["image_max_seqlen"] is not None:
247
+ max_seqlen = kwargs["image_max_seqlen"]
248
+ else:
249
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
250
+
251
+ attn_output, _ = attention_interface(
252
+ self,
253
+ query_states,
254
+ key_states,
255
+ value_states,
256
+ attention_mask=None,
257
+ scaling=self.scaling,
258
+ dropout=0.0 if not self.training else self.attention_dropout,
259
+ cu_seq_lens_q=cu_seqlens,
260
+ cu_seq_lens_k=cu_seqlens,
261
+ max_length_q=max_seqlen,
262
+ max_length_k=max_seqlen,
263
+ is_causal=False,
264
+ **kwargs,
265
+ )
266
+ else:
267
+ # Other implementations: Process each chunk separately
268
+ lengths = cu_seqlens[1:] - cu_seqlens[:-1]
269
+ splits = [
270
+ torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
271
+ ]
272
+
273
+ attn_outputs = [
274
+ attention_interface(
275
+ self,
276
+ q,
277
+ k,
278
+ v,
279
+ attention_mask=None,
280
+ scaling=self.scaling,
281
+ dropout=0.0 if not self.training else self.attention_dropout,
282
+ is_causal=False,
283
+ **kwargs,
284
+ )[0]
285
+ for q, k, v in zip(*splits)
286
+ ]
287
+ attn_output = torch.cat(attn_outputs, dim=1)
288
+
289
+ attn_output = attn_output.reshape(seq_length, -1).contiguous()
290
+ attn_output = self.proj(attn_output)
291
+ return attn_output
292
+
293
+
294
+ class Qwen3VLVisionBlock(GradientCheckpointingLayer):
295
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
296
+ super().__init__()
297
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=1e-6)
298
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=1e-6)
299
+ self.attn = Qwen3VLVisionAttention(config=config)
300
+ self.mlp = Qwen3VLVisionMLP(config=config)
301
+
302
+ def forward(
303
+ self,
304
+ hidden_states: torch.Tensor,
305
+ cu_seqlens: torch.Tensor,
306
+ rotary_pos_emb: Optional[torch.Tensor] = None,
307
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
308
+ **kwargs,
309
+ ) -> torch.Tensor:
310
+ hidden_states = hidden_states + self.attn(
311
+ self.norm1(hidden_states),
312
+ cu_seqlens=cu_seqlens,
313
+ rotary_pos_emb=rotary_pos_emb,
314
+ position_embeddings=position_embeddings,
315
+ **kwargs,
316
+ )
317
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
318
+ return hidden_states
319
+
320
+
321
+ class Qwen3VLTextRotaryEmbedding(nn.Module):
322
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
323
+
324
+ def __init__(self, config: Qwen3VLTextConfig, device=None):
325
+ super().__init__()
326
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
327
+ self.rope_type = config.rope_scaling.get("rope_type", "default")
328
+ else:
329
+ self.rope_type = "default"
330
+ self.max_seq_len_cached = config.max_position_embeddings
331
+ self.original_max_seq_len = config.max_position_embeddings
332
+
333
+ self.config = config
334
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
335
+
336
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
337
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
338
+ self.original_inv_freq = self.inv_freq
339
+
340
+ self.mrope_section = (config.rope_scaling or {}).get("mrope_section", [24, 20, 20])  # tolerate rope_scaling=None, matching the guard above
341
+
342
+ def apply_interleaved_mrope(self, freqs, mrope_section):
343
+ """Apply interleaved MRoPE to 3D rotary embeddings.
344
+ Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
345
+ interleaved [THTHWHTHW...TT], preserving frequency continuity.
346
+ args:
347
+ x: (3, bs, seq_len, head_dim // 2)
348
+ mrope_section: (3,)
349
+ returns:
350
+ x_t: (bs, seq_len, head_dim // 2)
351
+ """
352
+ freqs_t = freqs[0] # just overwrite the first dimension T
353
+ for dim, offset in enumerate((1, 2), start=1): # H, W
354
+ length = mrope_section[dim] * 3
355
+ idx = slice(offset, length, 3)
356
+ freqs_t[..., idx] = freqs[dim, ..., idx]
357
+ return freqs_t
358
+
359
+ @torch.no_grad()
360
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
361
+ def forward(self, x, position_ids):
362
+ if position_ids.ndim == 2:
363
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
364
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
365
+ device = inv_freq_expanded.device
366
+ position_ids_expanded = position_ids[:, :, None, :].float().to(device)
367
+ freqs = (inv_freq_expanded @ position_ids_expanded).transpose(2, 3)
368
+ freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
369
+ emb = torch.cat((freqs, freqs), dim=-1)
370
+ cos = emb.cos() * self.attention_scaling
371
+ sin = emb.sin() * self.attention_scaling
372
+ return cos.contiguous(), sin.contiguous()
373
+
374
+
375
+ @use_kernel_forward_from_hub("RMSNorm")
376
+ class Qwen3VLTextRMSNorm(nn.Module):
377
+ def __init__(self, hidden_size, eps: float = 1e-6) -> None:
378
+ """
379
+ Qwen3VLTextRMSNorm is equivalent to T5LayerNorm
380
+ """
381
+ super().__init__()
382
+ self.weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
383
+ self.variance_epsilon = eps
384
+
385
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
386
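+ # In equation form (the fallback path below): y = weight * x / sqrt(mean(x^2, dim=-1) + eps),
+ # i.e. no mean subtraction and no bias, which is what makes it T5LayerNorm-style.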
+ if HAS_FUSED_RMSNORM and hidden_states.is_cuda:
387
+ x = hidden_states if hidden_states.dtype == torch.bfloat16 else hidden_states.to(torch.bfloat16)
388
+ x = x.contiguous()
389
+ return _FUSED_RMSFUNC.apply(x, self.weight, self.variance_epsilon, self.weight.shape[0])
390
+ input_dtype = hidden_states.dtype
391
+ hidden_states = hidden_states.to(torch.float32)
392
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
393
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
394
+ return self.weight * hidden_states.to(input_dtype)
395
+
396
+ def extra_repr(self):
397
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
398
+
399
+
400
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
401
+ """Applies Rotary Position Embedding to the query and key tensors.
402
+
403
+ Args:
404
+ q (`torch.Tensor`): The query tensor.
405
+ k (`torch.Tensor`): The key tensor.
406
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
407
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
408
+ position_ids (`torch.Tensor`, *optional*):
409
+ Deprecated and unused.
410
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
411
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
412
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
413
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
414
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
415
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
416
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
417
+ Returns:
418
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
419
+ """
420
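+ # Typical shapes (illustrative): q/k arrive as (batch, heads, seq, head_dim); when
+ # cos/sin arrive as (batch, seq, head_dim), the unsqueeze below inserts the missing
+ # heads axis (dim 1 by default) so the element-wise products broadcast; callers that
+ # already pass 4-D cos/sin skip that step.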
+ if HAS_QWEN_ROPE_V2 and q.is_cuda and q.dtype == torch.bfloat16 and q.shape[-1] in (64, 128):
421
+ # qwen_rope_kernel_2 handles (S, D) cos/sin for (S, H, D) input naturally.
422
+ if cos.dtype != torch.float32:
423
+ cos = cos.to(torch.float32)
424
+ if sin.dtype != torch.float32:
425
+ sin = sin.to(torch.float32)
426
+ return fused_qwen_rope_v2(q, cos, sin), fused_qwen_rope_v2(k, cos, sin)
427
+
428
+ if cos.ndim != q.ndim:
429
+ cos = cos.unsqueeze(unsqueeze_dim)
430
+ sin = sin.unsqueeze(unsqueeze_dim)
431
+ if cos.dtype != q.dtype:
432
+ cos = cos.to(q.dtype)
433
+ sin = sin.to(q.dtype)
434
+ q_embed = (q * cos) + (rotate_half(q) * sin)
435
+ k_embed = (k * cos) + (rotate_half(k) * sin)
436
+ return q_embed, k_embed
437
+
438
+
439
+ class Qwen3VLTextAttention(nn.Module):
440
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
441
+
442
+ def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
443
+ super().__init__()
444
+ self.config = config
445
+ self.layer_idx = layer_idx
446
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
447
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
448
+ self.scaling = self.head_dim**-0.5
449
+ self.attention_dropout = config.attention_dropout
450
+ self.is_causal = True
451
+
452
+ self.q_proj = nn.Linear(
453
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
454
+ )
455
+ self.k_proj = nn.Linear(
456
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
457
+ )
458
+ self.v_proj = nn.Linear(
459
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
460
+ )
461
+ self.o_proj = nn.Linear(
462
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
463
+ )
464
+ self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
465
+ self.k_norm = Qwen3VLTextRMSNorm(
466
+ self.head_dim, eps=config.rms_norm_eps
467
+ ) # thus post q_norm does not need reshape
468
+
469
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
470
+ def forward(
471
+ self,
472
+ hidden_states: torch.Tensor,
473
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
474
+ attention_mask: Optional[torch.Tensor],
475
+ past_key_values: Optional[Cache] = None,
476
+ cache_position: Optional[torch.LongTensor] = None,
477
+ **kwargs: Unpack[FlashAttentionKwargs],
478
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
479
+ input_shape = hidden_states.shape[:-1]
480
+ hidden_shape = (*input_shape, -1, self.head_dim)
481
+
482
+ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
483
+ key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
484
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
485
+
486
+ cos, sin = position_embeddings
487
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
488
+
489
+ if past_key_values is not None:
490
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
491
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
492
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
493
+
494
+ attention_interface: Callable = eager_attention_forward
495
+ if self.config._attn_implementation != "eager":
496
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
497
+
498
+ attn_output, attn_weights = attention_interface(
499
+ self,
500
+ query_states,
501
+ key_states,
502
+ value_states,
503
+ attention_mask,
504
+ dropout=0.0 if not self.training else self.attention_dropout,
505
+ scaling=self.scaling,
506
+ **kwargs,
507
+ )
508
+
509
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
510
+ attn_output = self.o_proj(attn_output)
511
+ return attn_output, attn_weights
512
+
513
+
514
+ class Qwen3VLTextMLP(nn.Module):
515
+ def __init__(self, config):
516
+ super().__init__()
517
+ self.config = config
518
+ self.hidden_size = config.hidden_size
519
+ self.intermediate_size = config.intermediate_size
520
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
521
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
522
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
523
+ self.act_fn = ACT2FN[config.hidden_act]
524
+
525
+ def forward(self, x):
526
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
527
+ return down_proj
528
+
529
+
530
+ class Qwen3VLTextDecoderLayer(GradientCheckpointingLayer):
531
+ def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
532
+ super().__init__()
533
+ self.hidden_size = config.hidden_size
534
+
535
+ self.self_attn = Qwen3VLTextAttention(config=config, layer_idx=layer_idx)
536
+
537
+ self.mlp = Qwen3VLTextMLP(config)
538
+ self.input_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
539
+ self.post_attention_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
540
+
541
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
542
+ def forward(
543
+ self,
544
+ hidden_states: torch.Tensor,
545
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
546
+ attention_mask: Optional[torch.Tensor] = None,
547
+ position_ids: Optional[torch.LongTensor] = None,
548
+ past_key_values: Optional[Cache] = None,
549
+ use_cache: Optional[bool] = False,
550
+ cache_position: Optional[torch.LongTensor] = None,
551
+ **kwargs: Unpack[TransformersKwargs],
552
+ ) -> torch.Tensor:
553
+ residual = hidden_states
554
+ hidden_states = self.input_layernorm(hidden_states)
555
+ # Self Attention. DEBUG: when packing mode is used, this call enters `qwen3vl_forward` in `train_utils.py`
556
+ hidden_states, _ = self.self_attn(
557
+ hidden_states=hidden_states,
558
+ attention_mask=attention_mask,
559
+ position_ids=position_ids,
560
+ past_key_values=past_key_values,
561
+ use_cache=use_cache,
562
+ cache_position=cache_position,
563
+ position_embeddings=position_embeddings,
564
+ **kwargs,
565
+ )
566
+ hidden_states = residual + hidden_states
567
+
568
+ # Fully Connected
569
+ residual = hidden_states
570
+ hidden_states = self.post_attention_layernorm(hidden_states)
571
+ hidden_states = self.mlp(hidden_states)
572
+ hidden_states = residual + hidden_states
573
+ return hidden_states
574
+
575
+
576
+ @dataclass
577
+ @auto_docstring(
578
+ custom_intro="""
579
+ Base class for Qwen3VL model outputs, with hidden states and attentions.
580
+ """
581
+ )
582
+ class Qwen3VLModelOutputWithPast(ModelOutput):
583
+ r"""
584
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
585
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
586
+
587
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
588
+ `past_key_values` input) to speed up sequential decoding.
589
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
590
+ The rope index difference between sequence length and multimodal rope.
591
+ """
592
+
593
+ last_hidden_state: Optional[torch.FloatTensor] = None
594
+ past_key_values: Optional[Cache] = None
595
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
596
+ attentions: Optional[tuple[torch.FloatTensor]] = None
597
+ rope_deltas: Optional[torch.LongTensor] = None
598
+
599
+
600
+ @auto_docstring
601
+ class Qwen3VLPreTrainedModel(PreTrainedModel):
602
+ config: Qwen3VLConfig
603
+ base_model_prefix = "model"
604
+ supports_gradient_checkpointing = True
605
+ _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
606
+ _skip_keys_device_placement = "past_key_values"
607
+ _supports_flash_attn = True
608
+ _supports_sdpa = True
609
+
610
+ _can_compile_fullgraph = True
611
+ _supports_attention_backend = True
612
+ _can_record_outputs = {
613
+ "hidden_states": Qwen3VLTextDecoderLayer,
614
+ "attentions": Qwen3VLTextAttention,
615
+ }
616
+
617
+
618
+ class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
619
+ config: Qwen3VLVisionConfig
620
+ _no_split_modules = ["Qwen3VLVisionBlock"]
621
+
622
+ def __init__(self, config, *inputs, **kwargs) -> None:
623
+ super().__init__(config, *inputs, **kwargs)
624
+ self.spatial_merge_size = config.spatial_merge_size
625
+ self.patch_size = config.patch_size
626
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
627
+
628
+ self.patch_embed = Qwen3VLVisionPatchEmbed(
629
+ config=config,
630
+ )
631
+
632
+ self.pos_embed = nn.Embedding(config.num_position_embeddings, config.hidden_size)
633
+ self.num_grid_per_side = int(config.num_position_embeddings**0.5)
634
+
635
+ head_dim = config.hidden_size // config.num_heads
636
+ self.rotary_pos_emb = Qwen3VLVisionRotaryEmbedding(head_dim // 2)
637
+
638
+ self.blocks = nn.ModuleList([Qwen3VLVisionBlock(config) for _ in range(config.depth)])
639
+ self.merger = Qwen3VLVisionPatchMerger(
640
+ config=config,
641
+ use_postshuffle_norm=False,
642
+ )
643
+
644
+ self.deepstack_visual_indexes = config.deepstack_visual_indexes
645
+ self.deepstack_merger_list = nn.ModuleList(
646
+ [
647
+ Qwen3VLVisionPatchMerger(
648
+ config=config,
649
+ use_postshuffle_norm=True,
650
+ )
651
+ for _ in range(len(config.deepstack_visual_indexes))
652
+ ]
653
+ )
654
+
655
+ self.gradient_checkpointing = False
656
+
657
+ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
658
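+ # Descriptive note: the (row, col) index of every patch is assembled on CPU in one
+ # pre-allocated buffer and copied to the device in a single transfer, then used to
+ # index the rotary frequency table; the result has shape (total_tokens, head_dim // 2)
+ # after flatten(1).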
+ merge_size = self.spatial_merge_size
659
+
660
+ max_hw = int(grid_thw[:, 1:].max().item())
661
+ freq_table = self.rotary_pos_emb(max_hw) # (max_hw, dim // 2)
662
+ device = freq_table.device
663
+
664
+ total_tokens = int(torch.prod(grid_thw, dim=1).sum().item())
665
+ # pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)
666
+ pos_ids_cpu = torch.empty((total_tokens, 2), dtype=torch.long, device="cpu")
667
+
668
+
669
+ offset = 0
670
+ for num_frames, height, width in grid_thw.numpy():
671
+ merged_h, merged_w = height // merge_size, width // merge_size
672
+
673
+ block_rows = torch.arange(merged_h, device="cpu") # block row indices
674
+ block_cols = torch.arange(merged_w, device="cpu") # block col indices
675
+ intra_row = torch.arange(merge_size, device="cpu") # intra-block row offsets
676
+ intra_col = torch.arange(merge_size, device="cpu") # intra-block col offsets
677
+
678
+ # Compute full-resolution positions
679
+ row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
680
+ col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]
681
+
682
+ row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
683
+ col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
684
+
685
+ coords = torch.stack((row_idx, col_idx), dim=-1)
686
+
687
+ if num_frames > 1:
688
+ coords = coords.repeat(num_frames, 1)
689
+
690
+ num_tokens = coords.shape[0]
691
+ pos_ids_cpu[offset : offset + num_tokens] = coords
692
+ offset += num_tokens
693
+
694
+ pos_ids = pos_ids_cpu.to(device, non_blocking=True)
695
+ embeddings = freq_table[pos_ids] # lookup rotary embeddings
696
+ embeddings = embeddings.flatten(1)
697
+ return embeddings
698
+
699
+ def fast_pos_embed_interpolate(self, grid_thw):
700
+ # grid_thw is already a CPU tensor, so it can be unpacked directly
701
+ grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]
702
+
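+ # What this computes (descriptive note): the learned pos_embed table is a
+ # num_grid_per_side x num_grid_per_side grid. For every image the h x w patch grid is
+ # mapped onto that table with torch.linspace, and each patch position is bilinearly
+ # interpolated from its four nearest table entries with weights (1-dh)(1-dw), (1-dh)dw,
+ # dh(1-dw) and dh*dw, before the embeddings are re-ordered further below to match the
+ # spatial-merge patch layout.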
703
+ idx_accum = [[] for _ in range(4)]
704
+ weight_accum = [[] for _ in range(4)]
705
+
706
+ # Pre-fetch config values to avoid getattr calls inside the loop
707
+ num_grid = self.num_grid_per_side
708
+
709
+ # This step still has to loop on the CPU because H/W vary per image, but it is pure arithmetic and fast
710
+ for h, w in zip(grid_hs, grid_ws):
711
+
712
+ h_idxs = torch.linspace(0, num_grid - 1, h)
713
+ w_idxs = torch.linspace(0, num_grid - 1, w)
714
+
715
+ h_idxs_floor = h_idxs.int()
716
+ w_idxs_floor = w_idxs.int()
717
+
718
+
719
+ h_idxs_ceil = (h_idxs_floor + 1).clamp(max=num_grid - 1)
720
+ w_idxs_ceil = (w_idxs_floor + 1).clamp(max=num_grid - 1)
721
+
722
+ dh = h_idxs - h_idxs_floor
723
+ dw = w_idxs - w_idxs_floor
724
+
725
+ base_h = h_idxs_floor * num_grid
726
+ base_h_ceil = h_idxs_ceil * num_grid
727
+
728
+
729
+ indices = [
730
+ (base_h[:, None] + w_idxs_floor[None, :]).flatten(),
731
+ (base_h[:, None] + w_idxs_ceil[None, :]).flatten(),
732
+ (base_h_ceil[:, None] + w_idxs_floor[None, :]).flatten(),
733
+ (base_h_ceil[:, None] + w_idxs_ceil[None, :]).flatten(),
734
+ ]
735
+
736
+ weights = [
737
+ ((1 - dh)[:, None] * (1 - dw)[None, :]).flatten(),
738
+ ((1 - dh)[:, None] * dw[None, :]).flatten(),
739
+ (dh[:, None] * (1 - dw)[None, :]).flatten(),
740
+ (dh[:, None] * dw[None, :]).flatten(),
741
+ ]
742
+
743
+ # Append the tensors directly instead of converting via tolist()
744
+ for i in range(4):
745
+ idx_accum[i].append(indices[i])
746
+ weight_accum[i].append(weights[i])
747
+
748
+
749
+ target_device = self.pos_embed.weight.device
750
+ target_dtype = self.pos_embed.weight.dtype
751
+
752
+ idx_tensor = torch.stack([torch.cat(acc) for acc in idx_accum]).to(device=target_device, dtype=torch.long)
753
+ weight_tensor = torch.stack([torch.cat(acc) for acc in weight_accum]).to(device=target_device, dtype=target_dtype)
754
+
755
+
756
+ pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
757
+ patch_pos_embeds = pos_embeds.sum(dim=0)
758
+
759
+
760
+ merge_size = self.config.spatial_merge_size
761
+ indices_list = []
762
+ current_offset = 0
763
+
764
+
765
+ for t, h, w in zip(grid_ts.tolist(), grid_hs.tolist(), grid_ws.tolist()):
766
+
767
+ local_ids = torch.arange(h * w, device='cpu').view(h, w)
768
+
769
+
770
+ local_ids_permuted = (
771
+ local_ids.view(h // merge_size, merge_size, w // merge_size, merge_size)
772
+ .permute(0, 2, 1, 3)
773
+ .reshape(-1)
774
+ )
775
+
776
+
777
+ global_ids = local_ids_permuted + current_offset
778
+
779
+
780
+ if t > 1:
781
+ global_ids = global_ids.repeat(t)
782
+
783
+ indices_list.append(global_ids)
784
+ current_offset += h * w
785
+
786
+
787
+ all_indices = torch.cat(indices_list).to(target_device)
788
+
789
+
790
+ patch_pos_embeds = patch_pos_embeds[all_indices]
791
+
792
+ return patch_pos_embeds
793
+
794
+
795
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
796
+ """
797
+ Args:
798
+ hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
799
+ The final hidden states of the model.
800
+ grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
801
+ The temporal, height and width of feature shape of each image in LLM.
802
+
803
+ Returns:
804
+ `torch.Tensor`: hidden_states.
805
+ """
806
+ hidden_states = self.patch_embed(hidden_states)
807
+
808
+ # Move grid_thw to CPU
809
+ grid_thw_cpu = grid_thw.cpu()
810
+
811
+ pos_embeds = self.fast_pos_embed_interpolate(grid_thw_cpu)
812
+ hidden_states = hidden_states + pos_embeds
813
+
814
+ rotary_pos_emb = self.rot_pos_emb(grid_thw_cpu)
815
+
816
+ seq_len, _ = hidden_states.size()
817
+ hidden_states = hidden_states.reshape(seq_len, -1)
818
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
819
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
820
+ cos = emb.cos().to(torch.float32).unsqueeze(-2).contiguous()
821
+ sin = emb.sin().to(torch.float32).unsqueeze(-2).contiguous()
822
+ cos = cos.to(device=hidden_states.device, non_blocking=True)
823
+ sin = sin.to(device=hidden_states.device, non_blocking=True)
824
+ position_embeddings = (cos, sin)
825
+
826
+ # Use the GPU-resident grid_thw here
827
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
828
+ dim=0,
829
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
830
+ )
831
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
832
+ cu_seqlens = cu_seqlens.to(device=hidden_states.device)
833
+
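+ # Example of the resulting layout (illustrative): grid_thw = [[2, 4, 6]] gives 24
+ # patches per frame, repeated for 2 frames -> cumsum [24, 48] -> padded to
+ # cu_seqlens = [0, 24, 48], i.e. one attention segment per frame.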
834
+
835
+ deepstack_feature_lists = []
836
+ for layer_num, blk in enumerate(self.blocks):
837
+ if self.gradient_checkpointing and self.training:
838
+ blk.gradient_checkpointing = False
839
+ def create_custom_forward(module):
840
+ def custom_forward(*inputs):
841
+ return module(inputs[0], inputs[1], inputs[2], inputs[3], **inputs[4])
842
+ return custom_forward
843
+
844
+ hidden_states = self._gradient_checkpointing_func(
845
+ create_custom_forward(blk),
846
+ hidden_states,
847
+ cu_seqlens,
848
+ None,
849
+ position_embeddings,
850
+ kwargs,
851
+ )
852
+ else:
853
+ hidden_states = blk(
854
+ hidden_states,
855
+ cu_seqlens=cu_seqlens,
856
+ position_embeddings=position_embeddings,
857
+ **kwargs,
858
+ )
859
+ if layer_num in self.deepstack_visual_indexes:
860
+ deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](
861
+ hidden_states
862
+ )
863
+ deepstack_feature_lists.append(deepstack_feature)
864
+
865
+ hidden_states = self.merger(hidden_states)
866
+
867
+ return hidden_states, deepstack_feature_lists
868
+
869
+
870
+ @auto_docstring(
871
+ custom_intro=(
872
+ "Text part of Qwen3VL, "
873
+ "not a pure text-only model, as DeepStack integrates visual features into the early hidden states."
874
+ )
875
+ )
876
+ class Qwen3VLTextModel(Qwen3VLPreTrainedModel):
877
+ config: Qwen3VLTextConfig
878
+ _no_split_modules = ["Qwen3VLTextDecoderLayer"]
879
+
880
+ def __init__(self, config: Qwen3VLTextConfig):
881
+ super().__init__(config)
882
+ self.padding_idx = config.pad_token_id
883
+ self.vocab_size = config.vocab_size
884
+
885
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
886
+ self.layers = nn.ModuleList(
887
+ [Qwen3VLTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
888
+ )
889
+ self.norm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
890
+ self.rotary_emb = Qwen3VLTextRotaryEmbedding(config=config)
891
+ self.gradient_checkpointing = False
892
+
893
+ # Initialize weights and apply final processing
894
+ self.post_init()
895
+
896
+
897
+ def get_input_embeddings(self):
898
+ return self.embed_tokens
899
+
900
+ def set_input_embeddings(self, value):
901
+ self.embed_tokens = value
902
+
903
+ @check_model_inputs()
904
+ @auto_docstring
905
+ def forward(
906
+ self,
907
+ input_ids: Optional[torch.LongTensor] = None,
908
+ attention_mask: Optional[torch.Tensor] = None,
909
+ position_ids: Optional[torch.LongTensor] = None,
910
+ past_key_values: Optional[Cache] = None,
911
+ inputs_embeds: Optional[torch.FloatTensor] = None,
912
+ use_cache: Optional[bool] = None,
913
+ cache_position: Optional[torch.LongTensor] = None,
914
+ # args for deepstack
915
+ visual_pos_masks: Optional[torch.Tensor] = None,
916
+ deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
917
+ **kwargs: Unpack[FlashAttentionKwargs],
918
+ ) -> Union[tuple, BaseModelOutputWithPast]:
919
+ r"""
920
+ visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
921
+ The mask of the visual positions.
922
+ deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
923
+ The deepstack visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim).
924
+ The feature is extracted from the different visual encoder layers, and fed to the decoder
925
+ hidden states. It's from the paper DeepStack(https://arxiv.org/abs/2406.04334).
926
+ """
927
+ if (input_ids is None) ^ (inputs_embeds is not None):
928
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
929
+
930
+ # torch.jit.trace() doesn't support cache objects in the output
931
+ if use_cache and past_key_values is None and not torch.jit.is_tracing():
932
+ past_key_values = DynamicCache(config=self.config)
933
+
934
+ if inputs_embeds is None:
935
+ inputs_embeds = self.embed_tokens(input_ids)
936
+
937
+ if cache_position is None:
938
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
939
+ cache_position = torch.arange(
940
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
941
+ )
942
+
943
+ # the hard coded `3` is for temporal, height and width.
944
+ if position_ids is None:
945
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1) # (3, bs, seq_length)
946
+ elif position_ids.ndim == 2:
947
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
948
+
949
+ if position_ids.ndim == 3 and position_ids.shape[0] == 4:
950
+ text_position_ids = position_ids[0]
951
+ position_ids = position_ids[1:]
952
+ else:
953
+ text_position_ids = position_ids[0]
954
+ # NOTE: when packing mode is used, this `create_causal_mask` is overridden and simply returns `attention_mask`.
955
+ attention_mask = create_causal_mask(
956
+ config=self.config,
957
+ input_embeds=inputs_embeds,
958
+ attention_mask=attention_mask,
959
+ cache_position=cache_position,
960
+ past_key_values=past_key_values,
961
+ position_ids=text_position_ids,
962
+ )
963
+
964
+ hidden_states = inputs_embeds
965
+
966
+ # create position embeddings to be shared across the decoder layers
967
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
968
+ cos, sin = position_embeddings
969
+ cos = cos.to(device=hidden_states.device, non_blocking=True).unsqueeze(1).contiguous()
970
+ sin = sin.to(device=hidden_states.device, non_blocking=True).unsqueeze(1).contiguous()
971
+ position_embeddings = (cos, sin)
972
+
973
+ # decoder layers
974
+ for layer_idx, decoder_layer in enumerate(self.layers):
975
+ if self.gradient_checkpointing and self.training:
976
+ decoder_layer.gradient_checkpointing = False
977
+ def create_custom_forward(module): # DEBUG: Here we enter the Qwen3VLTextDecoderLayer forward
978
+ def custom_forward(*inputs):
979
+ # inputs: hidden_states, position_embeddings, attention_mask, position_ids, past_key_values, use_cache, cache_position, kwargs_dict
980
+ return module(
981
+ inputs[0],
982
+ inputs[1],
983
+ attention_mask=inputs[2],
984
+ position_ids=inputs[3],
985
+ past_key_values=inputs[4],
986
+ use_cache=inputs[5],
987
+ cache_position=inputs[6],
988
+ **inputs[7]
989
+ )
990
+ return custom_forward
991
+
992
+ layer_outputs = self._gradient_checkpointing_func(
993
+ create_custom_forward(decoder_layer),
994
+ hidden_states,
995
+ position_embeddings,
996
+ attention_mask,
997
+ text_position_ids,
998
+ past_key_values,
999
+ False, # use_cache
1000
+ cache_position,
1001
+ kwargs,
1002
+ )
1003
+ else:
1004
+ layer_outputs = decoder_layer(
1005
+ hidden_states,
1006
+ attention_mask=attention_mask,
1007
+ position_ids=text_position_ids,
1008
+ past_key_values=past_key_values,
1009
+ cache_position=cache_position,
1010
+ position_embeddings=position_embeddings,
1011
+ **kwargs,
1012
+ )
1013
+ hidden_states = layer_outputs
1014
+
1015
+ # add visual features to the hidden states of first several layers
1016
+ if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)):
1017
+ hidden_states = self._deepstack_process(
1018
+ hidden_states,
1019
+ visual_pos_masks,
1020
+ deepstack_visual_embeds[layer_idx],
1021
+ )
1022
+
1023
+ hidden_states = self.norm(hidden_states)
1024
+
1025
+ return BaseModelOutputWithPast(
1026
+ last_hidden_state=hidden_states,
1027
+ past_key_values=past_key_values,
1028
+ )
1029
+
1030
+ def _deepstack_process(
1031
+ self, hidden_states: torch.Tensor, visual_pos_masks: torch.Tensor, visual_embeds: torch.Tensor
1032
+ ):
1033
+ visual_pos_masks = visual_pos_masks.to(hidden_states.device)
1034
+ visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype)
1035
+ local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds
1036
+ hidden_states[visual_pos_masks, :] = local_this
1037
+ return hidden_states
1038
+
1039
+
1040
+ @dataclass
1041
+ @auto_docstring(
1042
+ custom_intro="""
1043
+ Base class for Qwen3VL causal language model (or autoregressive) outputs.
1044
+ """
1045
+ )
1046
+ class Qwen3VLCausalLMOutputWithPast(ModelOutput):
1047
+ r"""
1048
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
1049
+ Language modeling loss (for next-token prediction).
1050
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
1051
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
1052
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1053
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
1054
+
1055
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
1056
+ `past_key_values` input) to speed up sequential decoding.
1057
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
1058
+ The rope index difference between sequence length and multimodal rope.
1059
+ """
1060
+
1061
+ loss: Optional[torch.FloatTensor] = None
1062
+ logits: Optional[torch.FloatTensor] = None
1063
+ past_key_values: Optional[Cache] = None
1064
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
1065
+ attentions: Optional[tuple[torch.FloatTensor]] = None
1066
+ rope_deltas: Optional[torch.LongTensor] = None
1067
+
1068
+
1069
+ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
1070
+ _checkpoint_conversion_mapping = {}
1071
+ _tied_weights_keys = ["lm_head.weight"]
1072
+ # Reference: fix gemma3 grad acc #37208
1073
+ accepts_loss_kwargs = False
1074
+ config: Qwen3VLConfig
1075
+ _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
1076
+
1077
+ def __init__(self, config):
1078
+ super().__init__(config)
1079
+ # Directly initialize visual and language_model instead of using Qwen3VLModel
1080
+ self.visual = Qwen3VLVisionModel._from_config(config.vision_config)
1081
+ self.language_model = Qwen3VLTextModel._from_config(config.text_config)
1082
+ self.rope_deltas = None # cache rope_deltas here
1083
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
1084
+
1085
+ self.post_init()
1086
+
1087
+ def get_input_embeddings(self):
1088
+ return self.language_model.get_input_embeddings()
1089
+
1090
+ def set_input_embeddings(self, value):
1091
+ self.language_model.set_input_embeddings(value)
1092
+
1093
+ def set_decoder(self, decoder):
1094
+ self.language_model = decoder
1095
+
1096
+ def get_decoder(self):
1097
+ return self.language_model
1098
+
1099
+ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
1100
+ self.gradient_checkpointing = True
1101
+ self.visual.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
1102
+ self.language_model.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
1103
+
1104
+
1105
+ def get_rope_index(
1106
+ self,
1107
+ input_ids: Optional[torch.LongTensor] = None,
1108
+ image_grid_thw: Optional[torch.LongTensor] = None,
1109
+ video_grid_thw: Optional[torch.LongTensor] = None,
1110
+ attention_mask: Optional[torch.Tensor] = None,
1111
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1112
+ """Different from the original implementation, Qwen3VL use timestamps rather than absolute time position ids."""
1113
+
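+ # Worked example (illustrative only): for a prompt of 5 text tokens followed by one
+ # image with llm grid (t=1, h=2, w=2), the text tokens get positions 0..4 on all three
+ # axes, the 4 image tokens get t=5, h in {5, 6}, w in {5, 6}, and any text that follows
+ # continues from max + 1 = 7.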
1114
+ # Since we use timestamps to separate videos, like <t1> <vision_start> <frame1> <vision_end> <t2> <vision_start> <frame2> <vision_end>, the video_grid_thw should also be split
1115
+ if video_grid_thw is not None:
1116
+ video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0)
1117
+ video_grid_thw[:, 0] = 1
1118
+
1119
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
1120
+ image_token_id = self.config.image_token_id
1121
+ video_token_id = self.config.video_token_id
1122
+ vision_start_token_id = self.config.vision_start_token_id
1123
+ mrope_position_deltas = []
1124
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
1125
+ total_input_ids = input_ids
1126
+ if attention_mask is None:
1127
+ attention_mask = torch.ones_like(total_input_ids)
1128
+ position_ids = torch.ones(
1129
+ 3,
1130
+ input_ids.shape[0],
1131
+ input_ids.shape[1],
1132
+ dtype=input_ids.dtype,
1133
+ device=input_ids.device,
1134
+ )
1135
+ image_index, video_index = 0, 0
1136
+ attention_mask = attention_mask.to(total_input_ids.device)
1137
+ for i, input_ids in enumerate(total_input_ids):
1138
+ input_ids = input_ids[attention_mask[i] == 1]
1139
+ image_nums, video_nums = 0, 0
1140
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
1141
+ vision_tokens = input_ids[vision_start_indices + 1]
1142
+ image_nums = (vision_tokens == image_token_id).sum()
1143
+ video_nums = (vision_tokens == video_token_id).sum()
1144
+ input_tokens = input_ids.tolist()
1145
+ llm_pos_ids_list: list = []
1146
+ st = 0
1147
+ remain_images, remain_videos = image_nums, video_nums
1148
+ for _ in range(image_nums + video_nums):
1149
+ if image_token_id in input_tokens and remain_images > 0:
1150
+ ed_image = input_tokens.index(image_token_id, st)
1151
+ else:
1152
+ ed_image = len(input_tokens) + 1
1153
+ if video_token_id in input_tokens and remain_videos > 0:
1154
+ ed_video = input_tokens.index(video_token_id, st)
1155
+ else:
1156
+ ed_video = len(input_tokens) + 1
1157
+ if ed_image < ed_video:
1158
+ t, h, w = (
1159
+ image_grid_thw[image_index][0],
1160
+ image_grid_thw[image_index][1],
1161
+ image_grid_thw[image_index][2],
1162
+ )
1163
+ image_index += 1
1164
+ remain_images -= 1
1165
+ ed = ed_image
1166
+
1167
+ else:
1168
+ t, h, w = (
1169
+ video_grid_thw[video_index][0],
1170
+ video_grid_thw[video_index][1],
1171
+ video_grid_thw[video_index][2],
1172
+ )
1173
+ video_index += 1
1174
+ remain_videos -= 1
1175
+ ed = ed_video
1176
+ llm_grid_t, llm_grid_h, llm_grid_w = (
1177
+ t.item(),
1178
+ h.item() // spatial_merge_size,
1179
+ w.item() // spatial_merge_size,
1180
+ )
1181
+ text_len = ed - st
1182
+
1183
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1184
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1185
+
1186
+ # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos)
1187
+ t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
1188
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
1189
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
1190
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
1191
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
1192
+
1193
+ if st < len(input_tokens):
1194
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1195
+ text_len = len(input_tokens) - st
1196
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1197
+
1198
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
1199
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
1200
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
1201
+ mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
1202
+ return position_ids, mrope_position_deltas
1203
+ else:
1204
+ if attention_mask is not None:
1205
+ position_ids = attention_mask.long().cumsum(-1) - 1
1206
+ position_ids.masked_fill_(attention_mask == 0, 1)
1207
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
1208
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
1209
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
1210
+ else:
1211
+ position_ids = (
1212
+ torch.arange(input_ids.shape[1], device=input_ids.device)
1213
+ .view(1, 1, -1)
1214
+ .expand(3, input_ids.shape[0], -1)
1215
+ )
1216
+ mrope_position_deltas = torch.zeros(
1217
+ [input_ids.shape[0], 1],
1218
+ device=input_ids.device,
1219
+ dtype=input_ids.dtype,
1220
+ )
1221
+
1222
+ return position_ids, mrope_position_deltas
1223
+
1224
+ def get_video_features(
1225
+ self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
1226
+ ):
1227
+ """
1228
+ Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.
1229
+
1230
+ Args:
1231
+ pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1232
+ The tensors corresponding to the input videos.
1233
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1234
+ The temporal, height and width of feature shape of each video in LLM.
1235
+ """
1236
+ # Same implementation as for images
1237
+ return self.get_image_features(pixel_values_videos, video_grid_thw)
1238
+
1239
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None, **kwargs):
1240
+ """
1241
+ Encodes images into continuous embeddings that can be forwarded to the language model.
1242
+
1243
+ Args:
1244
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1245
+ The tensors corresponding to the input images.
1246
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1247
+ The temporal, height and width of feature shape of each image in LLM.
1248
+ """
1249
+ pixel_values = pixel_values.type(self.visual.dtype)
1250
+ image_embeds, deepstack_feature_lists = self.visual(pixel_values, grid_thw=image_grid_thw, **kwargs)
1251
+ split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
1252
+ image_embeds = torch.split(image_embeds, split_sizes)
1253
+ return image_embeds, deepstack_feature_lists
1254
+
1255
+ def get_placeholder_mask(
1256
+ self,
1257
+ input_ids: torch.LongTensor,
1258
+ inputs_embeds: torch.FloatTensor,
1259
+ image_features: Optional[torch.FloatTensor] = None,
1260
+ video_features: Optional[torch.FloatTensor] = None,
1261
+ ):
1262
+ """
1263
+ Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
1264
+ equal to the length of multimodal features. If the lengths are different, an error is raised.
1265
+ """
1266
+ if input_ids is None:
1267
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
1268
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
1269
+ )
1270
+ special_image_mask = special_image_mask.all(-1)
1271
+ special_video_mask = inputs_embeds == self.get_input_embeddings()(
1272
+ torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
1273
+ )
1274
+ special_video_mask = special_video_mask.all(-1)
1275
+ else:
1276
+ special_image_mask = input_ids == self.config.image_token_id
1277
+ special_video_mask = input_ids == self.config.video_token_id
1278
+
1279
+ n_image_tokens = special_image_mask.sum()
1280
+ special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
1281
+ if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
1282
+ raise ValueError(
1283
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
1284
+ )
1285
+
1286
+ n_video_tokens = special_video_mask.sum()
1287
+ special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
1288
+ if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
1289
+ raise ValueError(
1290
+ f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
1291
+ )
1292
+
1293
+ return special_image_mask, special_video_mask
1294
+
1295
+ @check_model_inputs()
1296
+ def forward(
1297
+ self,
1298
+ input_ids: torch.LongTensor = None,
1299
+ attention_mask: Optional[torch.Tensor] = None,
1300
+ position_ids: Optional[torch.LongTensor] = None,
1301
+ past_key_values: Optional[Cache] = None,
1302
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1303
+ labels: Optional[torch.LongTensor] = None,
1304
+ pixel_values: Optional[torch.Tensor] = None,
1305
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1306
+ image_grid_thw: Optional[torch.LongTensor] = None,
1307
+ video_grid_thw: Optional[torch.LongTensor] = None,
1308
+ cache_position: Optional[torch.LongTensor] = None,
1309
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1310
+ **kwargs: Unpack[TransformersKwargs],
1311
+ ) -> Union[tuple, Qwen3VLCausalLMOutputWithPast]:
1312
+ r"""
1313
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1314
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1315
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1316
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1317
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1318
+ The temporal, height and width of feature shape of each image in LLM.
1319
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1320
+ The temporal, height and width of feature shape of each video in LLM.
1321
+
1322
+ Example:
1323
+ TODO: Add example
1324
+ """
1325
+ # Inlined from Qwen3VLModel.forward
1326
+ if (input_ids is None) ^ (inputs_embeds is not None):
1327
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1328
+
1329
+ if inputs_embeds is None:
1330
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1331
+
1332
+ image_mask = None
1333
+ video_mask = None
1334
+
1335
+ if pixel_values is not None:
1336
+ image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw, image_max_seqlen=kwargs.get("image_max_seqlen"))
1337
+ image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
1338
+
1339
+ image_mask, _ = self.get_placeholder_mask(
1340
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
1341
+ )
1342
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1343
+
1344
+
1345
+ if pixel_values_videos is not None:
1346
+ video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
1347
+ video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
1348
+
1349
+ _, video_mask = self.get_placeholder_mask(
1350
+ input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
1351
+ )
1352
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
1353
+
1354
+
1355
+ visual_pos_masks = None
1356
+ deepstack_visual_embeds = None
1357
+ if image_mask is not None and video_mask is not None:
1358
+ # aggregate visual_pos_masks and deepstack_visual_embeds
1359
+ image_mask = image_mask[..., 0]
1360
+ video_mask = video_mask[..., 0]
1361
+ visual_pos_masks = image_mask | video_mask
1362
+ deepstack_visual_embeds = []
1363
+ image_mask_joint = image_mask[visual_pos_masks]
1364
+ video_mask_joint = video_mask[visual_pos_masks]
1365
+ for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
1366
+ embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
1367
+ embed_joint[image_mask_joint, :] = img_embed
1368
+ embed_joint[video_mask_joint, :] = vid_embed
1369
+ deepstack_visual_embeds.append(embed_joint)
1370
+ elif image_mask is not None:
1371
+ image_mask = image_mask[..., 0]
1372
+ visual_pos_masks = image_mask
1373
+ deepstack_visual_embeds = deepstack_image_embeds
1374
+ elif video_mask is not None:
1375
+ video_mask = video_mask[..., 0]
1376
+ visual_pos_masks = video_mask
1377
+ deepstack_visual_embeds = deepstack_video_embeds
1378
+
1379
+ if position_ids is None:
1380
+ attention_mask_tensor = (
1381
+ attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
1382
+ )
1383
+ if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
1384
+ attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
1385
+ # Only apply conversion for floating point tensors (inverted masks)
1386
+ if attention_mask_tensor.dtype.is_floating_point:
1387
+ attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
1388
+ attention_mask_tensor = (1.0 - attention_mask_tensor).int()
1389
+
1390
+ # Calculate RoPE index once per generation in the pre-fill stage only.
1391
+ # When compiling, we can't check tensor values thus we check only input length
1392
+ # It is safe to assume that `length!=1` means we're in pre-fill because compiled
1393
+ # models currently cannot do assisted decoding
1394
+ prefill_compiled_stage = is_torchdynamo_compiling() and (
1395
+ (input_ids is not None and input_ids.shape[1] != 1)
1396
+ or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
1397
+ )
1398
+ prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
1399
+ (cache_position is not None and cache_position[0] == 0)
1400
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
1401
+ )
1402
+ if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
1403
+ position_ids, rope_deltas = self.get_rope_index(
1404
+ input_ids,
1405
+ image_grid_thw,
1406
+ video_grid_thw,
1407
+ attention_mask=attention_mask_tensor,
1408
+ )
1409
+ self.rope_deltas = rope_deltas
1410
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
1411
+ else:
1412
+ batch_size, seq_length, _ = inputs_embeds.shape
1413
+ delta = (
1414
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
1415
+ if cache_position is not None
1416
+ else 0
1417
+ )
1418
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
1419
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
1420
+ if cache_position is not None: # otherwise `deltas` is an int `0`
1421
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
1422
+ position_ids = position_ids.add(delta)
1423
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
1424
+
1425
+ if kwargs.get("max_seqlen") is not None:
1426
+ try:
1427
+ self.language_model.config.max_seqlen = int(kwargs.get("max_seqlen"))
1428
+ except Exception:
1429
+ self.language_model.config.max_seqlen = kwargs.get("max_seqlen")
1430
+
1431
+ outputs = self.language_model(
1432
+ input_ids=None,
1433
+ position_ids=position_ids,
1434
+ attention_mask=attention_mask,
1435
+ past_key_values=past_key_values,
1436
+ inputs_embeds=inputs_embeds,
1437
+ cache_position=cache_position,
1438
+ visual_pos_masks=visual_pos_masks,
1439
+ deepstack_visual_embeds=deepstack_visual_embeds,
1440
+ **kwargs,
1441
+ )
1442
+
1443
+ hidden_states = outputs[0]
1444
+
1445
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1446
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1447
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1448
+
1449
+ loss = None
1450
+ if labels is not None:
1451
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
1452
+
1453
+ return Qwen3VLCausalLMOutputWithPast(
1454
+ loss=loss,
1455
+ logits=logits,
1456
+ past_key_values=outputs.past_key_values,
1457
+ rope_deltas=self.rope_deltas,
1458
+ )
1459
+
1460
+ def prepare_inputs_for_generation(
1461
+ self,
1462
+ input_ids,
1463
+ past_key_values=None,
1464
+ attention_mask=None,
1465
+ inputs_embeds=None,
1466
+ cache_position=None,
1467
+ position_ids=None,
1468
+ use_cache=True,
1469
+ pixel_values=None,
1470
+ pixel_values_videos=None,
1471
+ image_grid_thw=None,
1472
+ video_grid_thw=None,
1473
+ **kwargs,
1474
+ ):
1475
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
1476
+
1477
+ model_inputs = super().prepare_inputs_for_generation(
1478
+ input_ids,
1479
+ past_key_values=past_key_values,
1480
+ attention_mask=attention_mask,
1481
+ inputs_embeds=inputs_embeds,
1482
+ cache_position=cache_position,
1483
+ position_ids=position_ids,
1484
+ pixel_values=pixel_values,
1485
+ pixel_values_videos=pixel_values_videos,
1486
+ image_grid_thw=image_grid_thw,
1487
+ video_grid_thw=video_grid_thw,
1488
+ use_cache=use_cache,
1489
+ **kwargs,
1490
+ )
1491
+
1492
+ # Qwen3VL position_ids are prepared with rope_deltas in forward
1493
+ model_inputs["position_ids"] = None
1494
+
1495
+ if cache_position[0] != 0:
1496
+ model_inputs["pixel_values"] = None
1497
+ model_inputs["pixel_values_videos"] = None
1498
+
1499
+ return model_inputs
1500
+
1501
+ def _get_image_nums_and_video_nums(
1502
+ self,
1503
+ input_ids: Optional[torch.LongTensor],
1504
+ inputs_embeds: Optional[torch.Tensor] = None,
1505
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1506
+ """
1507
+ Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
1508
+ These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
1509
+
1510
+ Args:
1511
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1512
+ Indices of input sequence tokens in the vocabulary.
1513
+
1514
+ Returns:
1515
+ image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
1516
+ video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
1517
+ """
1518
+ image_token_id = self.config.image_token_id
1519
+ video_token_id = self.config.video_token_id
1520
+ vision_start_token_id = self.config.vision_start_token_id
1521
+
1522
+ if inputs_embeds is not None:
1523
+ vision_start_mask = (
1524
+ inputs_embeds
1525
+ == self.get_input_embeddings()(
1526
+ torch.tensor(vision_start_token_id, dtype=torch.long, device=inputs_embeds.device)
1527
+ )
1528
+ )[..., 0]
1529
+ image_mask = (
1530
+ inputs_embeds
1531
+ == self.get_input_embeddings()(
1532
+ torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
1533
+ )
1534
+ )[..., 0]
1535
+ video_mask = (
1536
+ inputs_embeds
1537
+ == self.get_input_embeddings()(
1538
+ torch.tensor(video_token_id, dtype=torch.long, device=inputs_embeds.device)
1539
+ )
1540
+ )[..., 0]
1541
+ else:
1542
+ vision_start_mask = input_ids == vision_start_token_id
1543
+ image_mask = input_ids == image_token_id
1544
+ video_mask = input_ids == video_token_id
1545
+
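+ # Counting trick (descriptive note): rolling vision_start_mask right by one marks the
+ # token immediately after each <|vision_start|>; intersecting that with the image /
+ # video placeholder masks therefore counts images and videos per sample.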
1546
+ vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
1547
+ image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
1548
+ video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
1549
+
1550
+ return image_nums, video_nums
1551
+
1552
+ def _expand_inputs_for_generation(
1553
+ self,
1554
+ expand_size: int = 1,
1555
+ is_encoder_decoder: bool = False,
1556
+ input_ids: Optional[torch.LongTensor] = None,
1557
+ **model_kwargs,
1558
+ ) -> tuple[torch.LongTensor, dict[str, Any]]:
1559
+ # Overwritten -- Support for expanding tensors without a batch size dimension
1560
+ # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t
1561
+ # pixel_values.shape[0] is sum(seqlen_images for samples)
1562
+ # image_grid_thw.shape[0] is sum(num_images for samples)
1563
+
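+ # Illustrative example: with two samples holding 2 and 1 images respectively and
+ # expand_size=2 (e.g. num_return_sequences=2), image_grid_thw rows are split per
+ # sample ([2, 1]) and each sample's rows are repeated twice back-to-back, while
+ # pixel_values is split by the per-sample patch counts and repeated the same way.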
1564
+ if expand_size == 1:
1565
+ return input_ids, model_kwargs
1566
+
1567
+ visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
1568
+
1569
+ def _expand_dict_for_generation_visual(dict_to_expand):
1570
+ image_grid_thw = model_kwargs.get("image_grid_thw", None)
1571
+ video_grid_thw = model_kwargs.get("video_grid_thw", None)
1572
+ image_nums, video_nums = self._get_image_nums_and_video_nums(
1573
+ input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
1574
+ )
1575
+
1576
+ def _repeat_interleave_samples(x, lengths, repeat_times):
1577
+ samples = torch.split(x, lengths)
1578
+ repeat_args = [repeat_times] + [1] * (x.dim() - 1)
1579
+ result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
1580
+ return result
1581
+
1582
+ for key in dict_to_expand:
1583
+ if key == "pixel_values":
1584
+ # split images into samples
1585
+ samples = torch.split(image_grid_thw, list(image_nums))
1586
+ # compute the sequence length of images for each sample
1587
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1588
+ dict_to_expand[key] = _repeat_interleave_samples(
1589
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1590
+ )
1591
+ elif key == "image_grid_thw":
1592
+ # get the num of images for each sample
1593
+ lengths = list(image_nums)
1594
+ dict_to_expand[key] = _repeat_interleave_samples(
1595
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1596
+ )
1597
+ elif key == "pixel_values_videos":
1598
+ samples = torch.split(video_grid_thw, list(video_nums))
1599
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1600
+ dict_to_expand[key] = _repeat_interleave_samples(
1601
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1602
+ )
1603
+ elif key == "video_grid_thw":
1604
+ lengths = list(video_nums)
1605
+ dict_to_expand[key] = _repeat_interleave_samples(
1606
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1607
+ )
1608
+ elif key == "second_per_grid_ts":
1609
+ dict_to_expand[key] = _repeat_interleave_samples(
1610
+ dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
1611
+ )
1612
+ return dict_to_expand
1613
+
1614
+ def _expand_dict_for_generation(dict_to_expand):
1615
+ for key in dict_to_expand:
1616
+ if (
1617
+ key != "cache_position"
1618
+ and dict_to_expand[key] is not None
1619
+ and isinstance(dict_to_expand[key], torch.Tensor)
1620
+ and key not in visual_keys
1621
+ ):
1622
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
1623
+ return dict_to_expand
1624
+
1625
+ model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
1626
+
1627
+ if input_ids is not None:
1628
+ input_ids = input_ids.repeat_interleave(expand_size, dim=0)
1629
+
1630
+ model_kwargs = _expand_dict_for_generation(model_kwargs)
1631
+
1632
+ if is_encoder_decoder:
1633
+ if model_kwargs.get("encoder_outputs") is None:
1634
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
1635
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
1636
+
1637
+ return input_ids, model_kwargs
1638
+
1639
+
1640
+ __all__ = [
1641
+ "Qwen3VLVisionModel",
1642
+ "Qwen3VLForConditionalGeneration",
1643
+ "Qwen3VLPreTrainedModel",
1644
+ "Qwen3VLTextModel",
1645
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_prts_qwen3_vl.PRTS_Qwen3VLProcessor"
4
+ },
5
+ "crop_size": null,
6
+ "data_format": "channels_first",
7
+ "default_to_square": true,
8
+ "device": null,
9
+ "disable_grouping": null,
10
+ "do_center_crop": null,
11
+ "do_convert_rgb": true,
12
+ "do_normalize": true,
13
+ "do_pad": null,
14
+ "do_rescale": true,
15
+ "do_resize": true,
16
+ "image_mean": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "image_processor_type": "Qwen2VLImageProcessorFast",
22
+ "image_std": [
23
+ 0.5,
24
+ 0.5,
25
+ 0.5
26
+ ],
27
+ "input_data_format": null,
28
+ "max_pixels": 147456,
29
+ "merge_size": 2,
30
+ "min_pixels": 65536,
31
+ "pad_size": null,
32
+ "patch_size": 16,
33
+ "processor_class": "PRTS_Qwen3VLProcessor",
34
+ "resample": 3,
35
+ "rescale_factor": 0.00392156862745098,
36
+ "return_tensors": null,
37
+ "size": {
38
+ "longest_edge": 147456,
39
+ "shortest_edge": 65536
40
+ },
41
+ "temporal_patch_size": 2
42
+ }
processing_prts_qwen3_vl.py ADDED
@@ -0,0 +1,352 @@
+ # Copyright 2025 TeleAI Rhodes Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Processor for PRTS built on Qwen3-VL (hub / trust_remote_code; no prts package required)."""
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Optional, Union
+
+ import numpy as np
+ import torch
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import (
+     ImagesKwargs,
+     MultiModalData,
+     ProcessingKwargs,
+     ProcessorMixin,
+     Unpack,
+     VideosKwargs,
+ )
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers.utils.logging import get_logger
+ from transformers.video_utils import VideoInput
+
+ ACTION_START_TOKEN = "<|action_start|>"
+ ACTION_PLACEHOLDER_TOKEN = "<|action_pad|>"
+ ACTION_END_TOKEN = "<|action_end|>"
+ CRL_GOAL_REPR_TOKEN = "<|goal_repr|>"
+ CRL_OBS_REPR_TOKEN = "<|obs_repr|>"
+ VISION_START_TOKEN = "<|vision_start|>"  # beginning of vision input
+ IMAGE_PLACEHOLDER_TOKEN = "<|image_pad|>"  # image placeholder
+ VIDEO_PLACEHOLDER_TOKEN = "<|video_pad|>"  # video placeholder
+
+ logger = get_logger(__name__)
+ if not logger.handlers:
+     handler = logging.StreamHandler()
+     handler.setLevel(logging.INFO)
+     handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
+     logger.addHandler(handler)
+
+
+ class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
+     pass
+
+
+ class Qwen3VLImagesKwargs(ImagesKwargs):
+     min_pixels: Optional[int]
+     max_pixels: Optional[int]
+     patch_size: Optional[int]
+     temporal_patch_size: Optional[int]
+     merge_size: Optional[int]
+
+
+ class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
+     images_kwargs: Qwen3VLImagesKwargs
+     videos_kwargs: Qwen3VLVideosProcessorKwargs
+     _defaults = {
+         "text_kwargs": {
+             "padding": False,
+             "return_token_type_ids": False,
+             "return_mm_token_type_ids": False,
+         },
+         "videos_kwargs": {"return_metadata": True},
+     }
+
+
+ class PRTS_Qwen3VLProcessor(ProcessorMixin):
+     r"""
+     Constructs a PRTS processor which wraps a Qwen3-VL image processor and a Qwen2 tokenizer into a single processor.
+
+     This processor is built independently (not inheriting from Qwen3VLProcessor) to avoid tight coupling,
+     while maintaining compatibility with Qwen3-VL's timestamp-based video processing approach.
+
+     [`PRTS_Qwen3VLProcessor`] offers all the functionality needed by the PRTS model, including:
+     - Action token handling (discrete and continuous)
+     - State token handling for proprioceptive inputs
+     - Expert trigger tokens for flow matching action prediction
+     - Qwen3-VL compatible image/video processing with timestamp-based video handling
+
+     Args:
+         image_processor ([`Qwen2VLImageProcessor`], *optional*):
+             The image processor is a required input.
+         tokenizer ([`Qwen2TokenizerFast`], *optional*):
+             The tokenizer is a required input.
+         video_processor ([`Qwen3VLVideoProcessor`], *optional*):
+             The video processor is a required input.
+         chat_template (`str`, *optional*):
+             A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
+     """
+
+     attributes = ["image_processor", "tokenizer", "video_processor"]
+     image_processor_class = "AutoImageProcessor"
+     video_processor_class = "AutoVideoProcessor"
+     tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+
+     def __init__(self, image_processor=None, tokenizer=None, video_processor=None,
+                  chat_template=None, **kwargs):
+         # Initialize base ProcessorMixin
+         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+
+         # Get image/video tokens from tokenizer
+         self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+         self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+         self.image_token_id = (
+             tokenizer.image_token_id
+             if getattr(tokenizer, "image_token_id", None)
+             else tokenizer.convert_tokens_to_ids(self.image_token)
+         )
+         self.video_token_id = (
+             tokenizer.video_token_id
+             if getattr(tokenizer, "video_token_id", None)
+             else tokenizer.convert_tokens_to_ids(self.video_token)
+         )
+
+         # Qwen3-VL vision tokens
+         self.vision_start_token = (
+             "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
+         )
+         self.vision_end_token = (
+             "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
+         )
+         self.vision_start_token_id = (
+             tokenizer.vision_start_token_id
+             if getattr(tokenizer, "vision_start_token_id", None)
+             else tokenizer.convert_tokens_to_ids(self.vision_start_token)
+         )
+         self.vision_end_token_id = (
+             tokenizer.vision_end_token_id
+             if getattr(tokenizer, "vision_end_token_id", None)
+             else tokenizer.convert_tokens_to_ids(self.vision_end_token)
+         )
+
+         # Register the PRTS-specific special tokens (action delimiters and CRL representation tokens).
+         prts_special_tokens = [
+             ACTION_START_TOKEN,
+             ACTION_PLACEHOLDER_TOKEN,
+             ACTION_END_TOKEN,
+             CRL_GOAL_REPR_TOKEN,
+             CRL_OBS_REPR_TOKEN,
+         ]
+         num_new_tokens = tokenizer.add_tokens(prts_special_tokens, special_tokens=True)
+         logger.info(f"Added {num_new_tokens} new special tokens to the tokenizer.")
+
+         self.action_token = getattr(tokenizer, "action_token", ACTION_PLACEHOLDER_TOKEN)
+         self.action_token_id = tokenizer.convert_tokens_to_ids(self.action_token)
+         token_dict = {
+             "action_start_token_id": ACTION_START_TOKEN,
+             "action_token_id": ACTION_PLACEHOLDER_TOKEN,
+             "vision_start_token_id": VISION_START_TOKEN,
+             "image_token_id": IMAGE_PLACEHOLDER_TOKEN,
+             "video_token_id": VIDEO_PLACEHOLDER_TOKEN,
+             "crl_goal_repr_token_id": CRL_GOAL_REPR_TOKEN,
+             "crl_obs_repr_token_id": CRL_OBS_REPR_TOKEN,
+         }
+         self.token_ids = {key: tokenizer.convert_tokens_to_ids(value) for key, value in token_dict.items()}
+
+     def __call__(
+         self,
+         images: Optional[ImageInput] = None,
+         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+         videos: Optional[VideoInput] = None,
+         actions: Optional[torch.Tensor] = None,
+         **kwargs: Unpack[Qwen3VLProcessorKwargs],
+     ) -> BatchFeature:
+         output_kwargs = self._merge_kwargs(
+             Qwen3VLProcessorKwargs,
+             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+             **kwargs,
+         )
+
+         image_inputs = {}
+         if images is not None:
+             image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+             image_grid_thw = image_inputs["image_grid_thw"]
+         else:
+             image_grid_thw = None
+
+         videos_inputs = {}
+         if videos is not None:
+             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+             video_grid_thw = videos_inputs["video_grid_thw"]
+             if "return_metadata" not in kwargs:
+                 video_metadata = videos_inputs.pop("video_metadata", None)
+             else:
+                 video_metadata = videos_inputs.get("video_metadata", None)
+         else:
+             video_grid_thw = None
+             video_metadata = None
+
+         if not isinstance(text, list):
+             text = [text]
+
+         text = text.copy()
+
+         # Expand each <|image_pad|> placeholder to one token per merged vision patch.
+         if image_grid_thw is not None:
+             merge_length = self.image_processor.merge_size**2
+             index = 0
+             for i in range(len(text)):
+                 while self.image_token in text[i]:
+                     num_image_tokens = image_grid_thw[index].prod() // merge_length
+                     text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+                     index += 1
+                 text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+         # Videos additionally get per-frame timestamp prefixes, following Qwen3-VL's prompt format.
+         if video_grid_thw is not None:
+             merge_length = self.video_processor.merge_size**2
+             index = 0
+             for i in range(len(text)):
+                 while self.video_token in text[i]:
+                     if video_metadata is not None and index < len(video_metadata):
+                         metadata = video_metadata[index]
+                         if metadata.fps is None:
+                             logger.warning_once(
+                                 "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
+                                 "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
+                                 "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
+                             )
+                         metadata.fps = 24 if metadata.fps is None else metadata.fps
+
+                         curr_timestamp = self._calculate_timestamps(
+                             metadata.frames_indices,
+                             metadata.fps,
+                             self.video_processor.merge_size,
+                         )
+
+                         video_placeholder = ""
+                         frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
+                         for frame_idx in range(video_grid_thw[index][0]):
+                             curr_time = curr_timestamp[frame_idx]
+                             video_placeholder += f"<{curr_time:.1f} seconds>"
+                             video_placeholder += (
+                                 self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
+                             )
+
+                         if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
+                             text[i] = text[i].replace(
+                                 f"{self.vision_start_token}{self.video_token}{self.vision_end_token}",
+                                 video_placeholder,
+                                 1,
+                             )
+                         else:
+                             text[i] = text[i].replace(self.video_token, video_placeholder, 1)
+                     else:
+                         num_video_tokens = video_grid_thw[index].prod() // merge_length
+                         text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
+
+                     index += 1
+                 text[i] = text[i].replace("<|placeholder|>", self.video_token)
+
+         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+         self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
+
+         if return_mm_token_type_ids:
+             array_ids = np.array(text_inputs["input_ids"])
+             mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+             mm_token_type_ids[array_ids == self.image_token_id] = 1
+             text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+         output_data = {**text_inputs, **image_inputs, **videos_inputs}
+         if actions is not None:
+             output_data["actions"] = actions
+
+         return BatchFeature(data=output_data, tensor_type=return_tensors)
+
+     def _calculate_timestamps(self, indices: Union[list[int], np.ndarray], video_fps: float, merge_size: int = 2):
+         if not isinstance(indices, list):
+             indices = indices.tolist()
+         if len(indices) % merge_size != 0:
+             indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
+         timestamps = [idx / video_fps for idx in indices]
+         timestamps = [
+             (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
+         ]
+         return timestamps
+
+     def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
+         vision_data = {}
+         if image_sizes is not None:
+             images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
+             images_kwargs.update(kwargs)
+             merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
+
+             num_image_patches = [
+                 self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                 for image_size in image_sizes
+             ]
+             num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
+             vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+         if video_sizes is not None:
+             videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
+             videos_kwargs.update(kwargs)
+             merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
+             num_video_patches = [
+                 self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
+                 for video_size in video_sizes
+             ]
+             num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
+             vision_data["num_video_tokens"] = num_video_tokens
+
+         return MultiModalData(**vision_data)
+
+     def set_action_tokenizer(self, action_tokenizer):
+         self.action_tokenizer = action_tokenizer
+
+         prts_fast_action_tokens = [f"<|action_token_{i}|>" for i in range(action_tokenizer.vocab_size)]
+         num_new_tokens = self.tokenizer.add_tokens(prts_fast_action_tokens, special_tokens=True)
+         logger.info(f"Added {num_new_tokens} FAST action tokens to the tokenizer.")
+
+         self.action_token_start_index = self.tokenizer.convert_tokens_to_ids("<|action_token_0|>")
+         self.action_vocab_size = action_tokenizer.vocab_size
+
+         token_ids = self.tokenizer.convert_tokens_to_ids(prts_fast_action_tokens)
+         self.action_mapper = {k: v for k, v in zip(prts_fast_action_tokens, token_ids, strict=True)}
+
+     def preprocess_action(self, actions, **kwargs):
+         raise NotImplementedError
+
+     def post_process_image_text_to_text(
+         self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+     ):
+         return self.tokenizer.batch_decode(
+             generated_outputs,
+             skip_special_tokens=skip_special_tokens,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             **kwargs,
+         )
+
+     @property
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+ PRTS_Qwen3VLProcessor.register_for_auto_class()
+
+ __all__ = ["PRTS_Qwen3VLProcessor"]
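For orientation, here is a rough usage sketch of the processor defined above. The prompt format and image size are illustrative assumptions; the placeholder tokens are the constants defined in `processing_prts_qwen3_vl.py`, and `processor` is assumed to be the instance loaded earlier with `trust_remote_code=True`:

```python
import numpy as np
from PIL import Image

# Dummy RGB image; any PIL image or array accepted by Qwen2VLImageProcessorFast works.
image = Image.fromarray(np.zeros((448, 448, 3), dtype=np.uint8))

# __call__ expands the single <|image_pad|> below into one token per merged vision
# patch (image_grid_thw.prod() // merge_size**2), keeping text and pixels aligned.
prompt = (
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>Describe the scene.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

inputs = processor(images=image, text=prompt, return_tensors="pt")
print(inputs["input_ids"].shape, inputs["pixel_values"].shape)
```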
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5482df2482307db564c0595428d3dfdad4bf5dbd9d3d5156052ca12f93b7d3ed
+ size 11828002
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:400af616c02e6ae8f34a358781f2a5d2158b3110c8a0c48d6f9e536c95fdc133
+ size 9809
video_preprocessor_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "crop_size": null,
+   "data_format": "channels_first",
+   "default_to_square": true,
+   "device": null,
+   "do_center_crop": null,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "do_sample_frames": true,
+   "fps": 2.0,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "input_data_format": null,
+   "max_frames": 8,
+   "merge_size": 2,
+   "min_frames": 4,
+   "num_frames": null,
+   "pad_size": null,
+   "patch_size": 16,
+   "processor_class": "PRTS_Qwen3VLProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "return_metadata": false,
+   "size": {
+     "longest_edge": 147456,
+     "shortest_edge": 65536
+   },
+   "temporal_patch_size": 2,
+   "video_metadata": null,
+   "video_processor_type": "Qwen3VLVideoProcessor"
+ }
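The frame-sampling fields above (`fps: 2.0`, `min_frames: 4`, `max_frames: 8`) are consumed by the `Qwen3VLVideoProcessor` that the PRTS processor wraps. A small sketch of how these knobs plausibly interact follows; the clamp-to-range semantics are an assumption about the video processor, not something stated in this diff:

```python
import math

FPS, MIN_FRAMES, MAX_FRAMES = 2.0, 4, 8  # values from video_preprocessor_config.json

def sampled_frame_count(duration_s: float) -> int:
    # Assumed semantics: sample at FPS frames per second, then clamp into [MIN_FRAMES, MAX_FRAMES].
    return int(min(max(math.floor(duration_s * FPS), MIN_FRAMES), MAX_FRAMES))

for duration in (1.0, 3.0, 10.0):
    print(f"{duration:>4}s clip -> {sampled_frame_count(duration)} sampled frames")
# 1.0s clip -> 4, 3.0s clip -> 6, 10.0s clip -> 8
```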
vocab.json ADDED
The diff for this file is too large to render. See raw diff