diff --git a/.gitattributes b/.gitattributes index 1380ecd3a8155bf422776b6ebd7932fc7ac904ee..70ecdc65c3a9da10fcd6fc3962eab94dd7bd0d53 100644 --- a/.gitattributes +++ b/.gitattributes @@ -39,3 +39,5 @@ kd_mllm/s1_siglip2_qwen3_4b/v1-20260320-102316/checkpoint-2181/tokenizer.json fi kd_mllm/s2_siglip2_qwen3_4b_10pct/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text video_mllm_swift/s1_declip_siglip2_qwen3_1.7b/v0-20260314-141147/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text video_mllm_swift/s1_declip_siglip2_qwen3_1.7b/v0-20260314-141147/checkpoint-2181/tokenizer.json filter=lfs diff=lfs merge=lfs -text +video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2181/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/args.json b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7ad716daa308581ab36f680aa193aa1bf3ff023b --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/args.json @@ -0,0 +1,382 @@ +{ + "output_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153", + "per_device_train_batch_size": 8, + "num_train_epochs": 1.0, + "max_steps": -1, + "learning_rate": 0.001, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_steps": 0, + "optim": "adamw_torch_fused", + "optim_args": null, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "optim_target_modules": null, + "gradient_accumulation_steps": 4, + "average_tokens_across_devices": true, + "max_grad_norm": 1.0, + "label_smoothing_factor": 0.0, + "bf16": true, + "fp16": false, + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "use_liger_kernel": false, + "liger_kernel_config": null, + "use_cache": false, + "neftune_noise_alpha": null, + "torch_empty_cache_steps": null, + "auto_find_batch_size": false, + "logging_strategy": "steps", + "logging_steps": 1, + "logging_first_step": true, + "log_on_each_node": true, + "logging_nan_inf_filter": true, + "include_num_input_tokens_seen": false, + "log_level": "passive", + "log_level_replica": "warning", + "disable_tqdm": null, + "report_to": [ + "none" + ], + "run_name": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153", + "project": "huggingface", + "trackio_space_id": "trackio", + "eval_strategy": "no", + "eval_steps": 500.0, + "eval_delay": 0, + "per_device_eval_batch_size": 1, + "prediction_loss_only": false, + "eval_on_start": false, + "eval_do_concat_batches": true, + "eval_use_gather_object": false, + "eval_accumulation_steps": null, + "include_for_metrics": [], + "batch_eval_metrics": false, + "save_only_model": false, + "save_strategy": "steps", + "save_steps": 500.0, + "save_on_each_node": false, + "save_total_limit": 2, + "enable_jit_checkpoint": false, + "push_to_hub": false, + "hub_token": null, + "hub_private_repo": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_always_push": false, + "hub_revision": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "restore_callback_states_from_checkpoint": false, + "full_determinism": false, + "seed": 42, + "data_seed": 42, + "use_cpu": false, + "accelerator_config": { + "dispatch_batches": false + }, + "parallelism_config": null, + "dataloader_drop_last": false, + "dataloader_num_workers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "dataloader_prefetch_factor": null, + "remove_unused_columns": true, + "label_names": null, + "train_sampling_strategy": "random", + "length_column_name": "length", + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "ddp_backend": null, + "ddp_timeout": 7200, + "fsdp": [], + "fsdp_config": null, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "debug": null, + "skip_memory_metrics": true, + "do_train": false, + "do_eval": false, + "do_predict": false, + "resume_from_checkpoint": null, + "warmup_ratio": 0.03, + "logging_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/runs", + "local_rank": 0, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "tuner_backend": "peft", + "vit_gradient_checkpointing": null, + "router_aux_loss_coef": 0.0, + "enable_dft_loss": false, + "enable_channel_loss": false, + "safe_serialization": true, + "max_shard_size": "5GB", + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "group_by_length": false, + "max_epochs": null, + "aligner_lr": null, + "vit_lr": null, + "use_logits_to_keep": null, + "ds3_gather_for_generation": true, + "resume_only_model": false, + "optimizer": null, + "loss_type": null, + "eval_metric": null, + "callbacks": [], + "early_stop_interval": null, + "eval_use_evalscope": false, + "eval_dataset": [], + "eval_dataset_args": null, + "eval_limit": null, + "eval_generation_config": null, + "extra_eval_args": null, + "tuner_type": "full", + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "use_flash_ckpt": false, + "use_ray": false, + "ray_exp_name": null, + "device_groups": null, + "model": "Qwen/Qwen3-1.7B", + "model_type": "llava_siglip2_qwen3", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "experts_impl": null, + "new_special_tokens": [], + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "max_model_len": null, + "local_repo_path": null, + "init_strategy": null, + "template": "llava_siglip2_qwen3", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "use_chat_template": true, + "padding_side": "right", + "padding_free": false, + "loss_scale": "default", + "sequence_parallel_size": 1, + "template_backend": "swift", + "response_prefix": null, + "enable_thinking": null, + "add_non_thinking_prefix": true, + "dataset": [ + "vmllm_s1_558k" + ], + "val_dataset": [], + "cached_dataset": [], + "cached_val_dataset": [], + "split_dataset_ratio": 0.0, + "dataset_num_proc": 16, + "load_from_cache_file": false, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": null, + "model_author": null, + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "structured_outputs_regex": null, + "train_type": null, + "adapters": [], + "external_plugins": [ + "video_mllm/model_plugin.py", + "video_mllm/dataset_plugin.py" + ], + "custom_register_path": [], + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "packing": false, + "packing_length": null, + "packing_num_proc": 1, + "lazy_tokenize": true, + "use_hf": true, + "ignore_args_error": false, + "use_swift_lora": false, + "freeze_parameters": [ + "model.language_model", + "lm_head", + "model.vision_tower" + ], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [ + "model.multi_modal_projector" + ], + "trainable_parameters_regex": null, + "freeze_llm": true, + "freeze_vit": true, + "freeze_aligner": false, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "target_parameters": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": "ms-swift", + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_notification_method": null, + "swanlab_webhook_url": null, + "swanlab_secret": null, + "swanlab_sender_email": null, + "swanlab_receiver_email": null, + "swanlab_smtp_server": null, + "swanlab_smtp_port": null, + "swanlab_email_language": "zh", + "swanlab_mode": "cloud", + "add_version": true, + "create_checkpoint_symlink": false, + "zero_hpz_partition_size": null, + "deepspeed_autotp_size": null, + "swift_version": "4.1.0.dev0", + "ckpt_dir": null, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "Qwen3-1.7B", + "model_info": "ModelInfo(model_type='llava_siglip2_qwen3', model_dir='/home/tiger/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling={'rope_theta': 1000000, 'rope_type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='llava_siglip2_qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None)], template=None, ignore_patterns=None, requires=None, tags=[])], loader=, template='llava_siglip2_qwen3', model_arch=MultiModelKeys(arch_name='llava_hf', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.multi_modal_projector'], vision_tower=['model.vision_tower'], generator=[]), architectures=['LlavaOnevisionForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=[], tags=['vision', 'video'])", + "model_dir": "/home/tiger/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e", + "template_meta": "QwenTemplateMeta(template_type='llava_siglip2_qwen3', prefix=[], prompt=['<|im_start|>user\\n{{QUERY}}<|im_end|>\\n<|im_start|>assistant\\n'], chat_sep=['<|im_end|>\\n'], suffix=['<|im_end|>\\n'], template_cls=, system_prefix=['<|im_start|>system\\n{{SYSTEM}}<|im_end|>\\n'], default_system=None, auto_add_bos=False, stop_words=['<|endoftext|>'], agent_template='hermes', is_thinking=False, thinking_prefix='', non_thinking_prefix='', history_thinking_prefix='')", + "_val_dataset_exists": false, + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153', per_device_train_batch_size=8, num_train_epochs=1.0, max_steps=-1, learning_rate=0.001, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_steps=0.03, optim=, optim_args=None, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, optim_target_modules=None, gradient_accumulation_steps=4, average_tokens_across_devices=None, max_grad_norm=1.0, label_smoothing_factor=0.0, bf16=True, fp16=False, bf16_full_eval=False, fp16_full_eval=False, tf32=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, use_liger_kernel=False, liger_kernel_config=None, use_cache=False, neftune_noise_alpha=None, torch_empty_cache_steps=None, auto_find_batch_size=False, logging_strategy=, logging_steps=1, logging_first_step=True, log_on_each_node=True, logging_nan_inf_filter=True, include_num_input_tokens_seen=None, log_level='passive', log_level_replica='warning', disable_tqdm=False, report_to=[], run_name='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153', project='huggingface', trackio_space_id='trackio', eval_strategy=, eval_steps=500.0, eval_delay=0, per_device_eval_batch_size=1, prediction_loss_only=False, eval_on_start=False, eval_do_concat_batches=True, eval_use_gather_object=False, eval_accumulation_steps=None, include_for_metrics=[], batch_eval_metrics=False, save_only_model=False, save_strategy=, save_steps=500, save_on_each_node=False, save_total_limit=2, enable_jit_checkpoint=False, push_to_hub=False, hub_token=None, hub_private_repo=None, hub_model_id=None, hub_strategy=, hub_always_push=False, hub_revision=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, restore_callback_states_from_checkpoint=False, full_determinism=False, seed=42, data_seed=42, use_cpu=False, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, dataloader_drop_last=False, dataloader_num_workers=1, dataloader_pin_memory=True, dataloader_persistent_workers=False, dataloader_prefetch_factor=2, remove_unused_columns=False, label_names=None, train_sampling_strategy='random', length_column_name='length', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, ddp_backend=None, ddp_timeout=7200, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, debug=[], skip_memory_metrics=True, do_train=False, do_eval=False, do_predict=False, resume_from_checkpoint=None, warmup_ratio=0.03, logging_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/runs', local_rank=0, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, safe_serialization=True, max_shard_size='5GB', check_model=True, acc_strategy='token', train_dataloader_shuffle=True, group_by_length=False, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='full', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)" +} \ No newline at end of file diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/args.json b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/args.json new file mode 100644 index 0000000000000000000000000000000000000000..7ad716daa308581ab36f680aa193aa1bf3ff023b --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/args.json @@ -0,0 +1,382 @@ +{ + "output_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153", + "per_device_train_batch_size": 8, + "num_train_epochs": 1.0, + "max_steps": -1, + "learning_rate": 0.001, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_steps": 0, + "optim": "adamw_torch_fused", + "optim_args": null, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "optim_target_modules": null, + "gradient_accumulation_steps": 4, + "average_tokens_across_devices": true, + "max_grad_norm": 1.0, + "label_smoothing_factor": 0.0, + "bf16": true, + "fp16": false, + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": null, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "use_liger_kernel": false, + "liger_kernel_config": null, + "use_cache": false, + "neftune_noise_alpha": null, + "torch_empty_cache_steps": null, + "auto_find_batch_size": false, + "logging_strategy": "steps", + "logging_steps": 1, + "logging_first_step": true, + "log_on_each_node": true, + "logging_nan_inf_filter": true, + "include_num_input_tokens_seen": false, + "log_level": "passive", + "log_level_replica": "warning", + "disable_tqdm": null, + "report_to": [ + "none" + ], + "run_name": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153", + "project": "huggingface", + "trackio_space_id": "trackio", + "eval_strategy": "no", + "eval_steps": 500.0, + "eval_delay": 0, + "per_device_eval_batch_size": 1, + "prediction_loss_only": false, + "eval_on_start": false, + "eval_do_concat_batches": true, + "eval_use_gather_object": false, + "eval_accumulation_steps": null, + "include_for_metrics": [], + "batch_eval_metrics": false, + "save_only_model": false, + "save_strategy": "steps", + "save_steps": 500.0, + "save_on_each_node": false, + "save_total_limit": 2, + "enable_jit_checkpoint": false, + "push_to_hub": false, + "hub_token": null, + "hub_private_repo": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_always_push": false, + "hub_revision": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "restore_callback_states_from_checkpoint": false, + "full_determinism": false, + "seed": 42, + "data_seed": 42, + "use_cpu": false, + "accelerator_config": { + "dispatch_batches": false + }, + "parallelism_config": null, + "dataloader_drop_last": false, + "dataloader_num_workers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "dataloader_prefetch_factor": null, + "remove_unused_columns": true, + "label_names": null, + "train_sampling_strategy": "random", + "length_column_name": "length", + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "ddp_backend": null, + "ddp_timeout": 7200, + "fsdp": [], + "fsdp_config": null, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "debug": null, + "skip_memory_metrics": true, + "do_train": false, + "do_eval": false, + "do_predict": false, + "resume_from_checkpoint": null, + "warmup_ratio": 0.03, + "logging_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/runs", + "local_rank": 0, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "tuner_backend": "peft", + "vit_gradient_checkpointing": null, + "router_aux_loss_coef": 0.0, + "enable_dft_loss": false, + "enable_channel_loss": false, + "safe_serialization": true, + "max_shard_size": "5GB", + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "group_by_length": false, + "max_epochs": null, + "aligner_lr": null, + "vit_lr": null, + "use_logits_to_keep": null, + "ds3_gather_for_generation": true, + "resume_only_model": false, + "optimizer": null, + "loss_type": null, + "eval_metric": null, + "callbacks": [], + "early_stop_interval": null, + "eval_use_evalscope": false, + "eval_dataset": [], + "eval_dataset_args": null, + "eval_limit": null, + "eval_generation_config": null, + "extra_eval_args": null, + "tuner_type": "full", + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "use_flash_ckpt": false, + "use_ray": false, + "ray_exp_name": null, + "device_groups": null, + "model": "Qwen/Qwen3-1.7B", + "model_type": "llava_siglip2_qwen3", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "experts_impl": null, + "new_special_tokens": [], + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "max_model_len": null, + "local_repo_path": null, + "init_strategy": null, + "template": "llava_siglip2_qwen3", + "system": null, + "max_length": 4096, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "use_chat_template": true, + "padding_side": "right", + "padding_free": false, + "loss_scale": "default", + "sequence_parallel_size": 1, + "template_backend": "swift", + "response_prefix": null, + "enable_thinking": null, + "add_non_thinking_prefix": true, + "dataset": [ + "vmllm_s1_558k" + ], + "val_dataset": [], + "cached_dataset": [], + "cached_val_dataset": [], + "split_dataset_ratio": 0.0, + "dataset_num_proc": 16, + "load_from_cache_file": false, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": null, + "model_author": null, + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "structured_outputs_regex": null, + "train_type": null, + "adapters": [], + "external_plugins": [ + "video_mllm/model_plugin.py", + "video_mllm/dataset_plugin.py" + ], + "custom_register_path": [], + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "packing": false, + "packing_length": null, + "packing_num_proc": 1, + "lazy_tokenize": true, + "use_hf": true, + "ignore_args_error": false, + "use_swift_lora": false, + "freeze_parameters": [ + "model.language_model", + "lm_head", + "model.vision_tower" + ], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [ + "model.multi_modal_projector" + ], + "trainable_parameters_regex": null, + "freeze_llm": true, + "freeze_vit": true, + "freeze_aligner": false, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "target_parameters": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": "ms-swift", + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_notification_method": null, + "swanlab_webhook_url": null, + "swanlab_secret": null, + "swanlab_sender_email": null, + "swanlab_receiver_email": null, + "swanlab_smtp_server": null, + "swanlab_smtp_port": null, + "swanlab_email_language": "zh", + "swanlab_mode": "cloud", + "add_version": true, + "create_checkpoint_symlink": false, + "zero_hpz_partition_size": null, + "deepspeed_autotp_size": null, + "swift_version": "4.1.0.dev0", + "ckpt_dir": null, + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "Qwen3-1.7B", + "model_info": "ModelInfo(model_type='llava_siglip2_qwen3', model_dir='/home/tiger/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling={'rope_theta': 1000000, 'rope_type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='llava_siglip2_qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None)], template=None, ignore_patterns=None, requires=None, tags=[])], loader=, template='llava_siglip2_qwen3', model_arch=MultiModelKeys(arch_name='llava_hf', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.multi_modal_projector'], vision_tower=['model.vision_tower'], generator=[]), architectures=['LlavaOnevisionForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=[], tags=['vision', 'video'])", + "model_dir": "/home/tiger/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e", + "template_meta": "QwenTemplateMeta(template_type='llava_siglip2_qwen3', prefix=[], prompt=['<|im_start|>user\\n{{QUERY}}<|im_end|>\\n<|im_start|>assistant\\n'], chat_sep=['<|im_end|>\\n'], suffix=['<|im_end|>\\n'], template_cls=, system_prefix=['<|im_start|>system\\n{{SYSTEM}}<|im_end|>\\n'], default_system=None, auto_add_bos=False, stop_words=['<|endoftext|>'], agent_template='hermes', is_thinking=False, thinking_prefix='', non_thinking_prefix='', history_thinking_prefix='')", + "_val_dataset_exists": false, + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153', per_device_train_batch_size=8, num_train_epochs=1.0, max_steps=-1, learning_rate=0.001, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_steps=0.03, optim=, optim_args=None, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, optim_target_modules=None, gradient_accumulation_steps=4, average_tokens_across_devices=None, max_grad_norm=1.0, label_smoothing_factor=0.0, bf16=True, fp16=False, bf16_full_eval=False, fp16_full_eval=False, tf32=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, use_liger_kernel=False, liger_kernel_config=None, use_cache=False, neftune_noise_alpha=None, torch_empty_cache_steps=None, auto_find_batch_size=False, logging_strategy=, logging_steps=1, logging_first_step=True, log_on_each_node=True, logging_nan_inf_filter=True, include_num_input_tokens_seen=None, log_level='passive', log_level_replica='warning', disable_tqdm=False, report_to=[], run_name='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153', project='huggingface', trackio_space_id='trackio', eval_strategy=, eval_steps=500.0, eval_delay=0, per_device_eval_batch_size=1, prediction_loss_only=False, eval_on_start=False, eval_do_concat_batches=True, eval_use_gather_object=False, eval_accumulation_steps=None, include_for_metrics=[], batch_eval_metrics=False, save_only_model=False, save_strategy=, save_steps=500, save_on_each_node=False, save_total_limit=2, enable_jit_checkpoint=False, push_to_hub=False, hub_token=None, hub_private_repo=None, hub_model_id=None, hub_strategy=, hub_always_push=False, hub_revision=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, restore_callback_states_from_checkpoint=False, full_determinism=False, seed=42, data_seed=42, use_cpu=False, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, dataloader_drop_last=False, dataloader_num_workers=1, dataloader_pin_memory=True, dataloader_persistent_workers=False, dataloader_prefetch_factor=2, remove_unused_columns=False, label_names=None, train_sampling_strategy='random', length_column_name='length', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, ddp_backend=None, ddp_timeout=7200, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, debug=[], skip_memory_metrics=True, do_train=False, do_eval=False, do_predict=False, resume_from_checkpoint=None, warmup_ratio=0.03, logging_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/runs', local_rank=0, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, safe_serialization=True, max_shard_size='5GB', check_model=True, acc_strategy='token', train_dataloader_shuffle=True, group_by_length=False, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='full', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)" +} \ No newline at end of file diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/chat_template.jinja b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01be9b307daa2d425f7c168c9fb145a286e0afb4 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/config.json b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..91fec50984b1ce69db1f04f83bf57934419cc5ac --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/config.json @@ -0,0 +1,248 @@ +{ + "architectures": [ + "LlavaOnevisionForConditionalGeneration" + ], + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "hidden_size": 2048, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_token_index": 151669, + "keys_to_ignore_at_inference": [ + "past_key_values" + ], + "model_type": "llava_onevision", + "multimodal_projector_bias": true, + "pad_token_id": 151643, + "projector_hidden_act": "gelu", + "text_config": { + "_name_or_path": "/home/tiger/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": false, + "video_token_index": 151670, + "vision_aspect_ratio": "anyres_max_9", + "vision_config": { + "attention_dropout": 0.0, + "dtype": "bfloat16", + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 26, + "patch_size": 14, + "vision_use_head": false + }, + "vision_feature_layer": -1, + "vision_feature_select_strategy": "full" +} diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/generation_config.json b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..caf77791d2c04f34887781e78a159cf8968d3fe6 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 151643, + "eos_token_id": [ + 151645, + 151643 + ], + "output_attentions": false, + "output_hidden_states": false, + "transformers_version": "5.2.0", + "use_cache": true +} diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10a2600ebb13b9afb54bbd85591363db79bde637 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6d2bdbea71e8c9c11ec4b99346f556636c59fa028d30c2e12413d57cf4622c +size 9845509 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8733bea6a2e58b0b1af9132c8c69b927056ae655 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:967b9ec28459481e1260b00b138c7a665e6edb3c22fee2fd239fb097e2e5f409 +size 9845445 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca6206ff72db149721ac9e15ab7620873aa28e78 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:994bcd1280aa4617e138b7f92a580b3d81500cda37bac33248b33113ccfb6e63 +size 9845509 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b29cd96c87a6b0372014518e2622be6c9fccd8e --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b61f8a871858c894b5773e30c4dd377b47fb68c78f30f35b403aea1d9671b879 +size 9845445 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2017fed4dc70f75633a0fa21498eb81a86896a3 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98a78b121826349f0ae8ddcd6bab48e93c8e89e49f00b7545f0232b401b03ef +size 9845445 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fff08083da5eab84dc18a32acd5c408a78d9eefb --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd1303b585432088ea30a3a995bd779d9afcc35a19b8d653b85f2be8c95b521d +size 9845445 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b544a612f910a14c5b2dca50d92585cb588594ae --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86491e96f456fd57fb812f0061955ebae8e902195ed38f1091778d87a800b140 +size 9845445 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57b6d057155b0a18d2e9c99c0915508ff1238e7a --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1085705577a69a7300406af3e3ff8c1524b4dd1983bd5df3a55a27c3c9638dc6 +size 9845445 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/mp_rank_00_model_states.pt b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fee8ab004a0ecdd5ff9742d44ac4a6da8a12cd9 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/global_step2000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0ca2104081407546e098904bb716aca9c5e5aaf4f2127597148be37f68ab88 +size 8487035043 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/latest b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/latest new file mode 100644 index 0000000000000000000000000000000000000000..2a79fdc19587e6bc9de060e90633f3a151b04516 --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/latest @@ -0,0 +1 @@ +global_step2000 \ No newline at end of file diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/model.safetensors b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..91b42ff15a2a9164ddb2e921b85fc1c5ad8337fd --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc8251748664cce3a2fc5aa5daa11dbe3395a02ab3d1de2ad34069f58e4cb5d +size 4872193968 diff --git a/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/processor_config.json b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..01039364dd6b06f3ca0a6df00c5f16fcb79e564a --- /dev/null +++ b/video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/processor_config.json @@ -0,0 +1,205 @@ +{ + "image_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_pad": true, + "do_rescale": true, + "do_resize": true, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "LlavaOnevisionImageProcessor", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "height": 384, + "width": 384 + } + }, + "image_token": "", + "num_image_tokens": 729, + "processor_class": "LlavaOnevisionProcessor", + "video_processor": { + "data_format": "channels_first", + "default_to_square": false, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": false, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "size": { + "height": 384, + "width": 384 + }, + "video_processor_type": "LlavaOnevisionVideoProcessor" + }, + "video_token": "