diff --git a/.gitattributes b/.gitattributes index e15ca5a29b2063bb686314b00a59dc1b4ae38457..5a15a019707c24b3b3ecedf38add2aa6c262c64b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6298,3 +6298,19 @@ neuronxcc-2.21.33363.0+82129205/MODULE_cb86ed7fc724b06726b3+fb4cc044/model.neff neuronxcc-2.21.18209.0+043b1bf7/MODULE_a9ea3bcc615b10517bb2+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text neuronxcc-2.21.33363.0+82129205/MODULE_9b4ea40b364ed3edd3ad+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text neuronxcc-2.21.33363.0+82129205/MODULE_55c2fa803a7e3881cef6+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text +neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.neff filter=lfs diff=lfs merge=lfs -text diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/60feecaa0c4c075e2f3e46a3f55d9a273f0ddd75a0ecf64e4ae27352e0819506/1c4ee5d7dc71b8843fca.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/60feecaa0c4c075e2f3e46a3f55d9a273f0ddd75a0ecf64e4ae27352e0819506/1c4ee5d7dc71b8843fca.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d2e5a4913a6235391b59b3be0cb4b5320c05ca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/60feecaa0c4c075e2f3e46a3f55d9a273f0ddd75a0ecf64e4ae27352e0819506/1c4ee5d7dc71b8843fca.json @@ -0,0 +1,87 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-0.6B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-0.6B", + "checkpoint_revision": "c54f2e6e80b2d7b7de06f51cec4959f6b3e03418", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 1, + "max_batch_size": 1, + "max_context_length": 8192, + "max_topk": 256, + "n_active_tokens": 8192, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 8192, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 1 + }, + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151669 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/026fe44014d3f650a32e.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/026fe44014d3f650a32e.json new file mode 100644 index 0000000000000000000000000000000000000000..a053e27463473226ceb8cdec2d194848e3d0fb94 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/026fe44014d3f650a32e.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-8B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-8B", + "checkpoint_revision": "1d8ad4ca9b3dd8059ad90a75d4983776a23d44af", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 16, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/2ad8fd5368e5c42f132c.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/2ad8fd5368e5c42f132c.json new file mode 100644 index 0000000000000000000000000000000000000000..755fc823d11ba6888e480e840cd907f6d4a515c9 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/2ad8fd5368e5c42f132c.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-8B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-8B", + "checkpoint_revision": "1d8ad4ca9b3dd8059ad90a75d4983776a23d44af", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/66877d91fb0121840163.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/66877d91fb0121840163.json new file mode 100644 index 0000000000000000000000000000000000000000..890cace77095cad9d2d1b895056f330759839a0e --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/66877d91fb0121840163.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-8B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-8B", + "checkpoint_revision": "1d8ad4ca9b3dd8059ad90a75d4983776a23d44af", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/ae5d00317f4b117f94a8.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/ae5d00317f4b117f94a8.json new file mode 100644 index 0000000000000000000000000000000000000000..5d9addba53088d0426588717880611742f91bc6d --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/ae5d00317f4b117f94a8.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-8B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-8B", + "checkpoint_revision": "1d8ad4ca9b3dd8059ad90a75d4983776a23d44af", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/d9b9f628f0dee7a926d6.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/d9b9f628f0dee7a926d6.json new file mode 100644 index 0000000000000000000000000000000000000000..b30effacb1069ca90ab0f05045c235c003e5e5d0 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/af58eb15d8e02338dc2f2e880e9c6ec803a98278914b3606acdcc252e7e18429/d9b9f628f0dee7a926d6.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-8B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-8B", + "checkpoint_revision": "1d8ad4ca9b3dd8059ad90a75d4983776a23d44af", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/07552dc6c695df3ea557.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/07552dc6c695df3ea557.json new file mode 100644 index 0000000000000000000000000000000000000000..908b2c13d2c33a7f8b5ade7c62948a0201d2f00c --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/07552dc6c695df3ea557.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 1, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 1 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/25770924bad8fff9ec23.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/25770924bad8fff9ec23.json new file mode 100644 index 0000000000000000000000000000000000000000..0d0a0f36c5327e269f03ba3f068510674f159b20 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/25770924bad8fff9ec23.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 8, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/4f788775782f89b676c9.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/4f788775782f89b676c9.json new file mode 100644 index 0000000000000000000000000000000000000000..3932f3f833b111584ace119d2e539344d8785149 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/4f788775782f89b676c9.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/7190036e8ed3be94399f.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/7190036e8ed3be94399f.json new file mode 100644 index 0000000000000000000000000000000000000000..673c2d4870ed17f69572d8f9275de2ab297228f7 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/7190036e8ed3be94399f.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 16, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/9054b94f39e5c374b6b8.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/9054b94f39e5c374b6b8.json new file mode 100644 index 0000000000000000000000000000000000000000..9de1e03e304c63b2477ffec5cdd0b4094af1cf8d --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/9054b94f39e5c374b6b8.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/bed88d82f075f516941d.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/bed88d82f075f516941d.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2d01b7226c6454cfd493111c1517d8b8c53461 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/bed88d82f075f516941d.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/cf5f2cf31cc338bcdce9.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/cf5f2cf31cc338bcdce9.json new file mode 100644 index 0000000000000000000000000000000000000000..a65453ec94556a32b66056d6090db1d622d37c98 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/cf5f2cf31cc338bcdce9.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 16, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 16, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/d06a9ca97bbded610b72.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/d06a9ca97bbded610b72.json new file mode 100644 index 0000000000000000000000000000000000000000..ead06e87aa4877b52c6e670253755c2d5d2c553b --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/d06a9ca97bbded610b72.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/d7e075cbb2c6bd78d6b4.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/d7e075cbb2c6bd78d6b4.json new file mode 100644 index 0000000000000000000000000000000000000000..e8d261f284545be837731ba5e9a240a7f0467e14 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/d7e075cbb2c6bd78d6b4.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 64, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 64, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/e189289909e4808416f7.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/e189289909e4808416f7.json new file mode 100644 index 0000000000000000000000000000000000000000..d689f7f78ecdd27cc273d7bc8fafe1fe9ff03566 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/e189289909e4808416f7.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/e79a64794f75d8045060.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/e79a64794f75d8045060.json new file mode 100644 index 0000000000000000000000000000000000000000..3c2efe7e7a253e97e761c70bd8bf8af45f1ddc40 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/e0b6d1e2424243dcd9ff1755e02969dcc312d14df531d876c5c2892f285b2863/e79a64794f75d8045060.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 8, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 8, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-0.6B/1c4ee5d7dc71b8843fca.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-0.6B/1c4ee5d7dc71b8843fca.json new file mode 100644 index 0000000000000000000000000000000000000000..f9d2e5a4913a6235391b59b3be0cb4b5320c05ca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-0.6B/1c4ee5d7dc71b8843fca.json @@ -0,0 +1,87 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-0.6B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-0.6B", + "checkpoint_revision": "c54f2e6e80b2d7b7de06f51cec4959f6b3e03418", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 1, + "max_batch_size": 1, + "max_context_length": 8192, + "max_topk": 256, + "n_active_tokens": 8192, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 8192, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 1 + }, + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151669 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-4B/07552dc6c695df3ea557.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-4B/07552dc6c695df3ea557.json new file mode 100644 index 0000000000000000000000000000000000000000..908b2c13d2c33a7f8b5ade7c62948a0201d2f00c --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-4B/07552dc6c695df3ea557.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-4B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-4B", + "checkpoint_revision": "5cf2132abc99cad020ac570b19d031efec650f2b", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 1, + "max_batch_size": 1, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 1 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-8B/d9b9f628f0dee7a926d6.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-8B/d9b9f628f0dee7a926d6.json new file mode 100644 index 0000000000000000000000000000000000000000..b30effacb1069ca90ab0f05045c235c003e5e5d0 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/qwen3/Qwen/Qwen3-Embedding-8B/d9b9f628f0dee7a926d6.json @@ -0,0 +1,95 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "Qwen/Qwen3-Embedding-8B", + "_task": "feature-extraction", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 32, + "capacity_factor": null, + "checkpoint_id": "Qwen/Qwen3-Embedding-8B", + "checkpoint_revision": "1d8ad4ca9b3dd8059ad90a75d4983776a23d44af", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 8, + "max_batch_size": 32, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": false, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 8 + }, + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151665 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/588f7836eb16c9483d90.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/588f7836eb16c9483d90.json new file mode 100644 index 0000000000000000000000000000000000000000..2e6f473d6b2048c556b258aaa8afd2cd9e2bc2e3 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5.dev2/smollm3/HuggingFaceTB/SmolLM3-3B/588f7836eb16c9483d90.json @@ -0,0 +1,134 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "HuggingFaceTB/SmolLM3-3B", + "_task": "text-generation", + "architectures": [ + "SmolLM3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 11008, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 65536, + "max_window_layers": 28, + "mlp_bias": false, + "model_type": "smollm3", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 1, + "capacity_factor": null, + "checkpoint_id": "HuggingFaceTB/SmolLM3-3B", + "checkpoint_revision": "a07cc9a04f16550a088caea529712d1d335b0ac1", + "continuous_batching": false, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 1, + "max_context_length": 1024, + "max_topk": 256, + "n_active_tokens": 1024, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.5.dev2", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 1024, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "no_rope_layer_interval": 4, + "no_rope_layers": [ + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0, + 1, + 1, + 1, + 0 + ], + "num_attention_heads": 16, + "num_hidden_layers": 36, + "num_key_value_heads": 4, + "pretraining_tp": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 5000000.0, + "sliding_window": null, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5/cf6b9a360dcf294104671106bae2adbd9fd291823bb60a351883163684073231/22277b72a5862009f452.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5/cf6b9a360dcf294104671106bae2adbd9fd291823bb60a351883163684073231/22277b72a5862009f452.json new file mode 100644 index 0000000000000000000000000000000000000000..1f153b07fb8bde9aa288f76b2b9d6a4f85d49b09 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5/cf6b9a360dcf294104671106bae2adbd9fd291823bb60a351883163684073231/22277b72a5862009f452.json @@ -0,0 +1,63 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "unsloth/Llama-3.2-1B-Instruct", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct", + "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c", + "continuous_batching": true, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.5", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5/llama/unsloth/Llama-3.2-1B-Instruct/22277b72a5862009f452.json b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5/llama/unsloth/Llama-3.2-1B-Instruct/22277b72a5862009f452.json new file mode 100644 index 0000000000000000000000000000000000000000..1f153b07fb8bde9aa288f76b2b9d6a4f85d49b09 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/0_REGISTRY/0.4.5/llama/unsloth/Llama-3.2-1B-Instruct/22277b72a5862009f452.json @@ -0,0 +1,63 @@ +{ + "_entry_class": "SingleModelCacheEntry", + "_model_id": "unsloth/Llama-3.2-1B-Instruct", + "_task": "text-generation", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "neuron": { + "_serialized_key": "NxDNeuronConfig", + "batch_size": 4, + "capacity_factor": null, + "checkpoint_id": "unsloth/Llama-3.2-1B-Instruct", + "checkpoint_revision": "5a8abab4a5d6f164389b1079fb721cfab8d7126c", + "continuous_batching": true, + "ep_degree": 1, + "fused_qkv": true, + "glu_mlp": true, + "local_ranks_size": 2, + "max_batch_size": 4, + "max_context_length": 4096, + "max_topk": 256, + "n_active_tokens": 4096, + "neuronxcc_version": "2.21.33363.0+82129205", + "on_device_sampling": true, + "optimum_neuron_version": "0.4.5", + "output_logits": false, + "pp_degree": 1, + "sequence_length": 4096, + "speculation_length": 0, + "start_rank_id": 0, + "target": "trn1", + "torch_dtype": "bfloat16", + "tp_degree": 2 + }, + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "unsloth_fixed": true, + "use_cache": true, + "vocab_size": 128256 +} \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..8f9a4e0eaa80162b7c6f367c963981a2d5c86b9e --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c252885b17164c4e4bf832f391aba7f3d8c7139bb59b1d4ee6df54c77398869 +size 840640 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..705a9ef34c5b5609dbac60a42b0fb67ab0b00d27 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_14268084bdb93e2af8ea+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c01b0bcf9bc956fe6facbe1b5c2a47bb487a061ab522c3a8cb46305bae28028f +size 54754304 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..ca712d55ae0e352f512c4688caf690116ec5c552 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f061fa115ae7cc4aa283bb767164180d1817ccea49eb05c899e7f4d05b03a1db +size 618697 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..0c407982ad6ab5500b85d41d030c0da0e25b72fe --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_17b4453648b482087f44+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116d5e8b16d09e647e218ae20fe1a255681d13daa59e28082c5e5ab4a7a8e03f +size 8070144 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..aae092f7271c8cdaefb0bc7887ea378e516368ed --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a917908cdedb3336190bdbfa5dc4c0b006052c9672b3c51ee0aa7c7a9e1e439 +size 628622 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..1f0624a7bfa513fd903b98a7bc8b836714b7f64c --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_1efaefc590ce7ee07e97+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cdc725c9067ddd09df8d0768aa1744294c6717afb9cb332d40ec2fa3d13f86d +size 2888704 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..fd0200c89032660ad01908166c47c54be46ec27f --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4bfd87783375e038bf25e0e2d8810154384e5f8e2e73640820a95ef156e16d6 +size 793180 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..bb85cc03f88d364116802323ce4c5f38f87b57f5 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_2828c4ae6bc360cc555f+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:121a4a45c22c23eaa0eb9bc94dcd4f162557e2c0574c2f9eefb1d7bcd069895b +size 242791424 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_2da63caa31e7595bc07f+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_2da63caa31e7595bc07f+fb4cc044/model.neff index 899a56b2ef4bb268bc3455d95df44d15c85f4db2..0d236132b203efa0292be95377b2a1813256e9ee 100644 --- a/neuronxcc-2.21.33363.0+82129205/MODULE_2da63caa31e7595bc07f+fb4cc044/model.neff +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_2da63caa31e7595bc07f+fb4cc044/model.neff @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd5dc789465f3d32882920308f3d4a712edf866512104c7effa624cfea21354d +oid sha256:b096fd09961ede9fc78f22a88f39e2023b064d98e4ecf8cae48c55a4e5c7b80e size 18473984 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..2f9adf4961ede24f992785a34b1d3e6fb050c158 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8270a9ede2522ef5f9691a06f9f5622f5923a5b4344b00ac1bbe7b10b666e6de +size 838840 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..2983f4588802feee07c78ff3af5bde590d52caf2 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_32f3df92f722f8d61840+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7237e792db77272b7eadefed43ad0b601d2d5eb78ea028a48d5a0a244ba47466 +size 26809344 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d066edbb58eb14ee536aee4b15890e6fa317d4dd --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a72c26ea4bfff05052405e8a360c5ead4dc213d746221968e452c3adbed7964 +size 618481 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..c078d66c559f4e0465a08cad1ca7e1f964285107 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_3683fa1d292af356adae+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2413f921e113dbc56d84a84a618ea2db978d407f5460463124a47a05840ab24 +size 4271104 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d40b852174adeda38245368059dae82c53a40e02 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f70fc236e9f92ab1fa0421a47be449b926660a67784b275927e8cb66dca0e39 +size 618697 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..08e82719ffd4588c36d505d96fc002d92dccf3ed --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_4effee1c1788c6eeab78+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:848f4efccf68564350550cac06b3729cabafd47692f4df18f5fe63748eb695b8 +size 10394624 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d42c3ba0d36b5dc258e2f260e64f760f19e0460f --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd7f7b1a2558219081ab0d313a1278db0fdf3a960970d5663b476de30e8725d +size 618481 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..3a6abfae04b6e30b82a4f27a9ff8e4c35044e792 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_4fcdffc44bb528a047f0+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef96d9b1de73a404b91e5a05677cccf742bcd9a3bda777340488acad29b97622 +size 5346304 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d4f5e4cb020cf2eed8a0567ff99aed07b4bcb061 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:232afcad03e2df34f698ed0701ed5e2072121afca9edb009d66a9aec12f1a379 +size 840640 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..f8b770fc8566507a864bbcdf97c1c1b30637fe37 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_5fdd7ad15e00da4fcd3e+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:296c2002b93c609d4a68eb22ac8c2afc5385391ddafe81315c177874ccdea5df +size 45046784 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..cdfd002b4ff01bf797e8bb61f8f7cae4e1bded36 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7771e8b633a25dad02aa999b81083c02c2cb7cc63b735ab794009698e7dd6524 +size 628810 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..d68cb1a51a233a6f2c6ad89f1741e5caff2b5de7 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_78602cd5234279501590+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02c1c244bc010e08ec988eee7ae622e88cf92b80df814adb48ab2d853395796 +size 22672384 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d735d5f93ef7e58283d5ccadc6065ea413da1269 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5485875eb0c496ff7c3ecea5e35812074c14b25db1f04586a6e66b8d50cd7 +size 810056 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/model.log b/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/model.log new file mode 100644 index 0000000000000000000000000000000000000000..ded2158bce7dc066a77620081921c11f58549eae --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_8508fced3b142be2a2ee+fb4cc044/model.log @@ -0,0 +1,53 @@ +Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/encoding/_tp0_bk0/model.MODULE_8508fced3b142be2a2ee+fb4cc044.hlo_module.pb', '--output', '/tmp/nxd_model/encoding/_tp0_bk0/model.MODULE_8508fced3b142be2a2ee+fb4cc044.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: 2026-02-09T10:25:27Z +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 5171 + convert 910 17.60% ################################################################ + reshape 802 15.51% ######################################################## + transpose 723 13.98% ################################################## + broadcast 548 10.60% ###################################### + slice 543 10.50% ###################################### + multiply 362 7.00% ######################### + parameter 328 6.34% ####################### + constant 221 4.27% ############### + call 217 4.20% ############### + dot 181 3.50% ############ + add 144 2.78% ########## + concatenate 74 1.43% ##### + negate 72 1.39% ##### + get-tuple-element 37 0.72% ## + iota 3 0.06% + gather 2 0.04% + tuple 1 0.02% + cosine 1 0.02% + reduce 1 0.02% + sine 1 0.02% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 4140 + convert 909 21.96% ################################################################ + reshape 650 15.70% ############################################# + transpose 540 13.04% ###################################### + parameter 328 7.92% ####################### + constant 256 6.18% ################## + broadcast 255 6.16% ################# + slice 252 6.09% ################# + custom-call 217 5.24% ############### + multiply 217 5.24% ############### + dot 180 4.35% ############ + add 144 3.48% ########## + concatenate 74 1.79% ##### + negate 72 1.74% ##### + get-tuple-element 37 0.89% ## + iota 3 0.07% + gather 2 0.05% + cosine 1 0.02% + tuple 1 0.02% + reduce 1 0.02% + sine 1 0.02% + +Potential split-points stats: #CC 0 #AR 0 #AG 0 #BN 0 nClamp 0 +WARNING: Insufficient number of potential split points found. Entire model will be compiled as a single module. +No partitions found. Compiling as flat model +2026-02-09 10:25:27.146344: F hilo/hlo_passes/NeuronHloVerifier.cc:504] [ERROR] [NCC_VRF007] Tiled instruction count 3784872448 exceeds 5000000. TIP: Input HLO might be too big, please consider using smaller batches, applying model parallelism or compile under --optlevel=1 to create smaller subgraphs + diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_88190e1b8a8ceb313e53+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_88190e1b8a8ceb313e53+fb4cc044/model.neff index 4eb7ed3a8f296ac58dd32e41668b8d4714e930c7..a00d63e822d6c03474c6bbda72034d9e3286eafd 100644 --- a/neuronxcc-2.21.33363.0+82129205/MODULE_88190e1b8a8ceb313e53+fb4cc044/model.neff +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_88190e1b8a8ceb313e53+fb4cc044/model.neff @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ca1943286c8e972bc0090e3fe3ca6935aab857bda4db85e1bd362d544cd43c6 +oid sha256:9e31980eef984a5dca7919bec01552bcd34f96c43a1391d0fb74dc4a5a650955 size 27689984 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d46e4cd3a8d58cdb0f73e8095b1e4d81cda74bc8 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35843e5fb4c876382538c2c296011d3e12c4bf8891f533afacc85305fa7369f7 +size 628841 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..20f0fa3038c73d1071750374d7b6a923f48094ff --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_8badd6f6eb69fa108ac8+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c058612fa664928eaf60fd5aa93e68c147886daf2c056edf90b378cc4a307b6 +size 10128384 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..390c9e8125947f5e84acb388aa326f3d1c3cbef5 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8fa7c229bd926ce09a4a20474350250e01123d6d2cd8a768ee6de4473861c1d +size 840640 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..b0c17d1a9efbc7da661af8aab7758a0b1855640d --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_93b5314f91b367ce312a+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce410eef7e8b7a3ea20cf29705cdb146065ea094acf50f0dccad6705a67d738 +size 88853504 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..338ad239791970a8058a5f26bd7a779f46002928 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f4a5d873782f0c2ddb0a99918ed821635ea8686356af4e9b834e35ca79e8936 +size 838696 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..03d9b31fb781c47fd53182d122e17615393e0903 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_9c01f09fb84f2b339fc6+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:248699dfced4324a86f270996c16bae9270f8ae24e4989704ec9b6e3b3b3d6c1 +size 11531264 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..09ca2127301827763e6b1a36e2411f5107ae5e17 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d56d0facb8ba4b04314bfea029cf93e012552fc164e0e5e04053fb9971cc9f9 +size 628838 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..d13b05182b679f9480810608ca8ab2b8bae1c44b --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_a08dd31a7a105fa45df2+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2703684352bbea1321e6c16c42bf42e61d68b745049074d232727ad247acb0 +size 5284864 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..a58d0a283b25107462046c3902d171acffe58d4c --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35df9c1b5244d0e9c581c6cee6799f0b7a9a5b237400d5afebb1a523cd29a078 +size 838840 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..91cfec5510627e8559f425c88f12856c856f2876 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_c913669652fcaa9d5638+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9396b601e6f597a226f6927edd90832616232783e20c1976cd34321a9c65fdb2 +size 22795264 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.done b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..77904b171c7b5146c3d2e6f049bcd63c61520983 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b4f366b8d782dd4d1d1d3bb864ff53306cb866cc8a1867efc3fbb2aa112d8d +size 617945 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.neff b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.neff new file mode 100644 index 0000000000000000000000000000000000000000..78834622890200b3956caa04dc82a18eec841763 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_d3f277cf573c91b9ded8+fb4cc044/model.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a6408a53c92a01df4a363d200f1754ac088fb38e45c17f2e3ff5edd31aff30e +size 200131584 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..aef42dbeb26b04f856f84f1bae494446196aa05e --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:412d9298a727c88ef811809b3cf48655cd25ce8300fe430c2187da886fffadf2 +size 793324 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/model.log b/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/model.log new file mode 100644 index 0000000000000000000000000000000000000000..6abc610510f70b376d8f5026c3ce1159e83d91ba --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_f5f81a13c8e6671b9b65+fb4cc044/model.log @@ -0,0 +1,53 @@ +Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/encoding/_tp0_bk0/model.MODULE_f5f81a13c8e6671b9b65+fb4cc044.hlo_module.pb', '--output', '/tmp/nxd_model/encoding/_tp0_bk0/model.MODULE_f5f81a13c8e6671b9b65+fb4cc044.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: 2026-02-04T14:17:28Z +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 5170 + convert 910 17.60% ################################################################ + reshape 802 15.51% ######################################################## + transpose 723 13.98% ################################################## + broadcast 547 10.58% ###################################### + slice 543 10.50% ###################################### + multiply 362 7.00% ######################### + parameter 328 6.34% ####################### + constant 221 4.27% ############### + call 217 4.20% ############### + dot 181 3.50% ############ + add 144 2.79% ########## + concatenate 74 1.43% ##### + negate 72 1.39% ##### + get-tuple-element 37 0.72% ## + iota 3 0.06% + gather 2 0.04% + tuple 1 0.02% + cosine 1 0.02% + reduce 1 0.02% + sine 1 0.02% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 4364 + convert 909 20.83% ################################################################ + reshape 870 19.94% ############################################################# + transpose 543 12.44% ###################################### + parameter 328 7.52% ####################### + constant 257 5.89% ################## + broadcast 256 5.87% ################## + slice 252 5.77% ################# + custom-call 217 4.97% ############### + multiply 217 4.97% ############### + dot 180 4.12% ############ + add 144 3.30% ########## + concatenate 74 1.70% ##### + negate 72 1.65% ##### + get-tuple-element 37 0.85% ## + gather 2 0.05% + iota 2 0.05% + cosine 1 0.02% + tuple 1 0.02% + reduce 1 0.02% + sine 1 0.02% + +Potential split-points stats: #CC 0 #AR 0 #AG 0 #BN 0 nClamp 0 +WARNING: Insufficient number of potential split points found. Entire model will be compiled as a single module. +No partitions found. Compiling as flat model +2026-02-04 14:17:28.019879: F hilo/hlo_passes/NeuronHloVerifier.cc:504] [ERROR] [NCC_VRF007] Tiled instruction count 7392334 exceeds 5000000. TIP: Input HLO might be too big, please consider using smaller batches, applying model parallelism or compile under --optlevel=1 to create smaller subgraphs + diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/compile_flags.json b/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/compile_flags.json new file mode 100644 index 0000000000000000000000000000000000000000..07bdc1045dd850da0b9d66d697f3755e9be37aca --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/compile_flags.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "-O2", "--lnc=1", "--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/model.hlo_module.pb b/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/model.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..406d26205a62d7bc1c6466b1bf413167a23cee9e --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/model.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7969893711a56172d92c4bb78eee80280687230fb1c3c29db069f04bf97396e +size 793324 diff --git a/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/model.log b/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/model.log new file mode 100644 index 0000000000000000000000000000000000000000..b3682c390188d6ce33fc93474fb72a0a54b8bcc0 --- /dev/null +++ b/neuronxcc-2.21.33363.0+82129205/MODULE_fb54563033b3f69f79b9+fb4cc044/model.log @@ -0,0 +1,3 @@ +Failed compilation with ['neuronx-cc', 'compile', '--framework=XLA', '/tmp/nxd_model/encoding/_tp0_bk0/model.MODULE_fb54563033b3f69f79b9+fb4cc044.hlo_module.pb', '--output', '/tmp/nxd_model/encoding/_tp0_bk0/model.MODULE_fb54563033b3f69f79b9+fb4cc044.neff', '--target=trn1', '--auto-cast=none', '--model-type=transformer', '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ', '-O2', '--lnc=1', '--logfile=/tmp/nxd_model/encoding/_tp0_bk0/log-neuron-cc.txt', '--verbose=35']: [LUR015] Compiler generated too many instructions (9816278). This maybe due to a failure in parallelism extraction by the tensorizer. - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. +2026-02-04T14:25:22Z Non-signal exit. Backend exited with code 1 and stderr: [LUR015] Compiler generated too many instructions (9816278). This maybe due to a failure in parallelism extraction by the tensorizer. - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new. You may also be able to obtain more information using the 'XLA_IR_DEBUG' and 'XLA_HLO_DEBUG' environment variables. +