MusYW committed on
Commit 7f0c6e6 · verified · 1 Parent(s): 4d5676b

Training in progress, step 500

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +19 -0
  2. added_tokens.json +28 -0
  3. config.json +32 -0
  4. merges.txt +0 -0
  5. model.safetensors +3 -0
  6. special_tokens_map.json +31 -0
  7. tmpcnl2zetw/__pycache__/_remote_module_non_scriptable.cpython-312.pyc +0 -0
  8. tmpcnl2zetw/_remote_module_non_scriptable.py +81 -0
  9. tmpfehyc297/_remote_module_non_scriptable.py +81 -0
  10. tokenizer.json +3 -0
  11. tokenizer_config.json +240 -0
  12. torchinductor_ch-epfl-345354-j/2f/8f4778b9c2bdc504a3c5d1b5bc09dac279f18294d78f93a4c581178508bcf83b.best_config +1 -0
  13. torchinductor_ch-epfl-345354-j/2f/c2fzymhcr3rme5dtns3jvvyp6x3osfrlp7cc7zg7igwzigqmmg65.py +46 -0
  14. torchinductor_ch-epfl-345354-j/2r/717267c6902a1a61a8cc50b68a007cec3f90a0241185b112347df8b34fa8c605.best_config +1 -0
  15. torchinductor_ch-epfl-345354-j/2r/c2rhxwmxh62lojowjb65g6mbzowlwbjcacwjmn3vu63z4qatxuo3.py +26 -0
  16. torchinductor_ch-epfl-345354-j/2y/c2ykxnj2iqrpp4u3ihziotcanxl3tc27h7ajzahx5wypy4anuhuj.py +88 -0
  17. torchinductor_ch-epfl-345354-j/3m/c3mt4utggpr6zcsqyeele6646fofhvyk4xxtwll4gqqa5w6nrbct.py +55 -0
  18. torchinductor_ch-epfl-345354-j/43/c43m5ctxi7dcy4hjgz5jijzo4xp7fp3bmvzcjp3ygmirxptgoerd.py +53 -0
  19. torchinductor_ch-epfl-345354-j/4i/c4iarmybewwgyq7pa6izmajgs66hg4cgb6yhmezt4tg6j77oklfi.py +50 -0
  20. torchinductor_ch-epfl-345354-j/53/c53mrwlx5sxivgg5x5z6kkaldo2q5yn2pjsymcv27tpzj2cdoeww.py +66 -0
  21. torchinductor_ch-epfl-345354-j/56/c56q66j66nfzeu5puvuhal4wt2foih6rnb5nwmqolafn3iq33kjp.py +66 -0
  22. torchinductor_ch-epfl-345354-j/57/c573irrqes6p6it4yfyvqw2efgfbbgp7yjzxjjxq5jpeesj3bi77.py +353 -0
  23. torchinductor_ch-epfl-345354-j/57/c574kngiopy3pgespyoupnzlae4d5tokyeui7uglwglnym2qijvn.py +30 -0
  24. torchinductor_ch-epfl-345354-j/57/dad7be19dc394c1e08368515640dff88b78797aaabae100a15a1f195476a9a87.best_config +1 -0
  25. torchinductor_ch-epfl-345354-j/5e/c5enonf6qztlsw7dozsqkejk4exzt4n56gbz6fiey2gnus5vdf76.py +66 -0
  26. torchinductor_ch-epfl-345354-j/5x/c5xsvywggx5vrzm2l5uaktu7pipclhdn5h6263yru2ugvuhe2nak.py +57 -0
  27. torchinductor_ch-epfl-345354-j/6d/c6dsbxlebwjqawzeprkq3lkldtxoiept4c6bpgtva5r4mjlrnwlr.py +229 -0
  28. torchinductor_ch-epfl-345354-j/6j/7c215475e7b40a21cf286026270965eb7f07e7c3af1c4052d331de3f74c6449e.best_config +1 -0
  29. torchinductor_ch-epfl-345354-j/6j/c6j5lx5qgycfvyi3dm5f4mo3ssluzzsrmdq32pka7e6pyhg42zvd.py +499 -0
  30. torchinductor_ch-epfl-345354-j/6j/c6jqjdux4scc3alxlsrcpnhemegj7ym5pw3twg6xb2eyx4codkvz.py +40 -0
  31. torchinductor_ch-epfl-345354-j/7a/c7a4b5izank2343xz4473c4igojrrhlfxb5ulctqd32qrtkreq3m.py +42 -0
  32. torchinductor_ch-epfl-345354-j/7a/ddaeab32a9175f6d14ae7329f3defe09537605c41fa4d35da5bf9cbac1616b91.best_config +1 -0
  33. torchinductor_ch-epfl-345354-j/7q/c7qudnwq7tyfwnepjsm2ilmratxdwkx4euvow7brbvrfif7hgnwh.py +324 -0
  34. torchinductor_ch-epfl-345354-j/a6/ca64rxymdowafnowfq53ckfynl3yei5mmfkeefu6f6xndlg3ukok.py +200 -0
  35. torchinductor_ch-epfl-345354-j/aotautograd/acxk7xhb35e5myvrfk4m2smos5f3rwybegalnbqbgtl3ghlaw2vw/entry +0 -0
  36. torchinductor_ch-epfl-345354-j/aotautograd/adw7o5w6jucvlwdu4mn3nk52nno5z3lt73pmvaksrn3cahxlwc5t/entry +0 -0
  37. torchinductor_ch-epfl-345354-j/aotautograd/agm67xcx3b2ejeqf3t422b43zsalmtzgitagqmb4kcd76dzg2sr6/entry +0 -0
  38. torchinductor_ch-epfl-345354-j/aotautograd/ahinqqlnserz457jqclv2vjeogmqix7jcrylpuhbc64kw4k3apfy/entry +0 -0
  39. torchinductor_ch-epfl-345354-j/aotautograd/ahji7b2arusm47q6ox5itjvurtws6r6kls2kskgxfnc2rqm4ojdg/entry +0 -0
  40. torchinductor_ch-epfl-345354-j/aotautograd/aibbpzcrlnv7lrbehiaaab4olrvijekv6m46vdzzqh3tbnvnl67m/entry +0 -0
  41. torchinductor_ch-epfl-345354-j/aotautograd/aig3hpjgzj7f27hhdphh7ozndqiwpruhugzjsiwyog75fn4y3rbj/entry +0 -0
  42. torchinductor_ch-epfl-345354-j/aotautograd/algm7vsngjdke6rmqon76peuppnhsp625k5d4zxnwgwdbdueo4ay/entry +0 -0
  43. torchinductor_ch-epfl-345354-j/aotautograd/amtpnp6cq6z6ddoun3fwe4zemhgpsp5jicklj6cf3qzsd3xbdeps/entry +0 -0
  44. torchinductor_ch-epfl-345354-j/aotautograd/aqskia64x2j4xks7dhp5cpq52le5j6js6ghxfhlvw7gfa6qr6stx/entry +0 -0
  45. torchinductor_ch-epfl-345354-j/aotautograd/aub73aicaqeihl4qdqbrvljzl2qxdzyu52zezeket676qt3pkgwk/entry +0 -0
  46. torchinductor_ch-epfl-345354-j/aotautograd/aypob3g4nwzt66m7ur252rhjjobqgnn4hvdhagr4474twkamikxg/entry +0 -0
  47. torchinductor_ch-epfl-345354-j/bm/cbmn253c3hy77ciw3f6meqi4bsbiio5zhw7hra5np6k5jyjqetnp.py +29 -0
  48. torchinductor_ch-epfl-345354-j/ce/cceyvpztlniy45jdq6sxx7o44obzjinfuxgsvnlhcr3hjdvmek73.py +38 -0
  49. torchinductor_ch-epfl-345354-j/cf/ae1632ffa009afdc4d40d5477a8e2ffd544972ad9ddf0c636c451826b3219579.best_config +1 -0
  50. torchinductor_ch-epfl-345354-j/cf/ccfnt2f53rlwauznvnabnitvjchbzg7at22w4x4fskqzmyirxuxq.py +50 -0
.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/5s/f5sdzjvbwcmgigljts5qiy6lpxvzqdph6wzu6phn3y3ibrcaorli/khjgqt4e4qpxv6eeg3vxxgpwqsv3grytkxtje4324jtolexrezo filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/6x/f6x5yse7q3r4kgrx3mzcmr7b2m72jtxrtziweqwk5lposwvr3y52/wqdd5re3wcyjqm2yolzwsoaqo7xdfcnam6vnnh7oy4krfpy35mu filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/ap/fap4jdkhbhos2zlgzy4vqldjzu7uaf3wuhatrkr2kcwc42gvg2yz/2zrvkpixflxwoduo4mf45p672nlwxrnyfi7jiaahhn2lw6eafh7 filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/ay/fay24mgdmhudth5v4jelopeu2revquda5zewn436r6biwrnpgabo/rua4hs554jun3xlx2vr5pka3oqdzq77wfoxhvymrffnwxy27nbt filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/be/fbei2uvrpqs44s3zmuwyjn6byfolnsqoa7juh23nj5xwvypzh4qm/isndyt5bxypcz3icmz7sxxnxnv5k5flzys3ydjieaujxkpv2by2 filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/dz/fdz6ntzqbiwadat6ybb6wx5r72slohcdcufolrfguxpyy72ha4k3/6wwa6mdlxwrojkbhhttg3r434evsqlaub3wfx47anl25o536b6x filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/el/felw7pslqa4fo6ex4wmphdc2ybuhij3y7nr6ek7weyw6exi5n6un/vtvnvlc4u5nq7hdpjcieofwiwev7pvczdillvjg3yxva6aojzbt filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/io/fiop3vsjr3dy3bv5akbf4igspko5bzgikfs2c5r65fwccfuvu7ux/qzhalrsuhi7mryjabm3iq6rhxlb5ozesgq6j7k5e7iyja2k7vsq filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/kl/fklqttybh3jmeqtx6bvvj4haqgxnjqufqyoz2nks5uigm4r36cx4/hxqz22nepqoek7pc76k5r3icp4zjsh4kaue55ohl6xt6tbvp4oi filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/lg/flg4hgizerwmptq2h73kr43ahriht7psoidazshg6qzgrnt647t2/pijpcpasxrep34djgpqcasekfyyagfa4g3bjwvevt3aj3ofuu6q filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/ng/fngku4s6qemdyw2pe5ve3jmj4j3kwxfgrfoqpednglkm5rrggltm/gpu6deqxpscuw7qomk5b3bn5nhb7huhwwde46fignb66f72pvmw filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/ro/froa33wtzy5mq3utlkybs5daxqnv2apit6smxhgb22cf4rr7fysv/rvynlwjqk2pli2qmjywzpal7ly5ufkhjhxzh4v7y5dgjunk5bmh filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/tm/ftmxrirms7cnlnwoxoz62nacrceoyhhzhbnuwmnbxdx426kuxbca/nmw3t6ssgbvfrpuhau55inf3woabflb6qc3ynybm53y42slvkos filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/tu/ftuiasjkco5eqbhoc3ebj57kfzkbkbzglhq2wrkg4hjpkhixsfbl/qpwpe3w2vqzxprimhpukbj46zopeocm5rat5f5apx7ghtoyyi67 filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/fxgraph/xb/fxbygtwj5f3jfikmhfoz7vd3mtceabjyw5ggscr4lin73qu5vibc/typ73hul4yzbqjyctyc3spof5hr4okcdyp27oj4p5lpfovus76n filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/triton/0/35ZDQUBPR56EFY64BDG6UB7OKJODSPJPYCTXXIKSOWZ2CA3EPWGA/triton_poi_fused__to_copy_cos_mul_sin_1.cubin filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/triton/0/SS6ERMBZYPQUFPYGWYCICDOLY35WYNXDIHS3C45P5EZJOFLNB5SQ/triton_poi_fused__to_copy_cos_mul_sin_1.cubin filter=lfs diff=lfs merge=lfs -text
+ torchinductor_ch-epfl-345354-j/triton/0/YGRXAKJ5T6UCGDY6RY3GMSSD4JVJFIRM6WYZVP7IT63SMM7NWHSA/triton_poi_fused__to_copy_cos_mul_sin_1.cubin filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "</think>": 151668,
+   "</tool_call>": 151658,
+   "</tool_response>": 151666,
+   "<think>": 151667,
+   "<tool_call>": 151657,
+   "<tool_response>": 151665,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "eos_token_id": 151643,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen3",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "pad_token_id": 151654,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.51.3",
+   "unsloth_fixed": true,
+   "unsloth_version": "2025.5.7",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
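
The config describes a small Qwen3 causal LM: 28 layers, hidden size 1024, grouped-query attention with 16 query and 8 key/value heads, tied input/output embeddings, bf16 weights. A minimal loading sketch, assuming the files in this commit sit together in a local directory (the name "checkpoint-500" is illustrative, not part of the commit) and an installed transformers with Qwen3 support (the config pins 4.51.3):

# Sketch: load the checkpoint this commit uploads. "checkpoint-500" is a
# placeholder for wherever these files live locally.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("checkpoint-500")
assert config.model_type == "qwen3" and config.hidden_size == 1024

model = AutoModelForCausalLM.from_pretrained(
    "checkpoint-500",
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
)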
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26045c44c8f7b94239544f69255de30ea5e05c7f8e23f0c01a67e755eaa0beba
+ size 1192135096
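
The weights are stored through Git LFS, so the repository itself holds only this three-line pointer (spec version, SHA-256 of the blob, size in bytes); the 1,192,135,096-byte payload lives in LFS storage, which at 2 bytes per bf16 parameter is consistent with a ~0.6B-parameter model. A small illustrative parser for the pointer format:

# Illustrative only: a Git LFS pointer file is "key value" lines.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:26045c44c8f7b94239544f69255de30ea5e05c7f8e23f0c01a67e755eaa0beba\n"
    "size 1192135096\n"
)
print(int(ptr["size"]) / 2**30)  # ~1.11 GiB of bf16 weights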
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|vision_pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tmpcnl2zetw/__pycache__/_remote_module_non_scriptable.cpython-312.pyc ADDED
Binary file (2.6 kB).
 
tmpcnl2zetw/_remote_module_non_scriptable.py ADDED
@@ -0,0 +1,81 @@
+ from typing import *
+
+ import torch
+ import torch.distributed.rpc as rpc
+ from torch import Tensor
+ from torch._jit_internal import Future
+ from torch.distributed.rpc import RRef
+ from typing import Tuple  # pyre-ignore: unused import
+
+
+ module_interface_cls = None
+
+
+ def forward_async(self, *args, **kwargs):
+     args = (self.module_rref, self.device, self.is_device_map_set, *args)
+     kwargs = {**kwargs}
+     return rpc.rpc_async(
+         self.module_rref.owner(),
+         _remote_forward,
+         args,
+         kwargs,
+     )
+
+
+ def forward(self, *args, **kwargs):
+     args = (self.module_rref, self.device, self.is_device_map_set, *args)
+     kwargs = {**kwargs}
+     ret_fut = rpc.rpc_async(
+         self.module_rref.owner(),
+         _remote_forward,
+         args,
+         kwargs,
+     )
+     return ret_fut.wait()
+
+
+ _generated_methods = [
+     forward_async,
+     forward,
+ ]
+
+
+
+
+ def _remote_forward(
+         module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+     module = module_rref.local_value()
+     device = torch.device(device)
+
+     if device.type != "cuda":
+         return module.forward(*args, **kwargs)
+
+     # If the module is on a cuda device,
+     # move any CPU tensor in args or kwargs to the same cuda device.
+     # Since torch script does not support generator expression,
+     # have to use concatenation instead of
+     # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+     args = (*args,)
+     out_args: Tuple[()] = ()
+     for arg in args:
+         arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+         out_args = out_args + arg
+
+     kwargs = {**kwargs}
+     for k, v in kwargs.items():
+         if isinstance(v, Tensor):
+             kwargs[k] = kwargs[k].to(device)
+
+     if is_device_map_set:
+         return module.forward(*out_args, **kwargs)
+
+     # If the device map is empty, then only CPU tensors are allowed to send over wire,
+     # so have to move any GPU tensor to CPU in the output.
+     # Since torch script does not support generator expression,
+     # have to use concatenation instead of
+     # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+     ret: Tuple[()] = ()
+     for i in module.forward(*out_args, **kwargs):
+         i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+         ret = ret + i
+     return ret
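
This file is PyTorch's generated template for non-scriptable RemoteModule forwards; the tmp*/ directories look like instantiation scratch dirs that were committed along with the checkpoint. The tuple-concatenation loops above exist only because TorchScript cannot compile generator expressions; in eager Python the same tensor-moving step collapses to one line, e.g. (illustrative sketch, not part of the generated file):

# Eager-mode equivalent of the out_args accumulation loop above.
from typing import Any, Tuple
import torch
from torch import Tensor

def move_to(device: torch.device, *args: Any) -> Tuple[Any, ...]:
    return tuple(a.to(device) if isinstance(a, Tensor) else a for a in args)

print(move_to(torch.device("cpu"), torch.ones(2), 3, "x"))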
tmpfehyc297/_remote_module_non_scriptable.py ADDED
@@ -0,0 +1,81 @@
+ from typing import *
+
+ import torch
+ import torch.distributed.rpc as rpc
+ from torch import Tensor
+ from torch._jit_internal import Future
+ from torch.distributed.rpc import RRef
+ from typing import Tuple  # pyre-ignore: unused import
+
+
+ module_interface_cls = None
+
+
+ def forward_async(self, *args, **kwargs):
+     args = (self.module_rref, self.device, self.is_device_map_set, *args)
+     kwargs = {**kwargs}
+     return rpc.rpc_async(
+         self.module_rref.owner(),
+         _remote_forward,
+         args,
+         kwargs,
+     )
+
+
+ def forward(self, *args, **kwargs):
+     args = (self.module_rref, self.device, self.is_device_map_set, *args)
+     kwargs = {**kwargs}
+     ret_fut = rpc.rpc_async(
+         self.module_rref.owner(),
+         _remote_forward,
+         args,
+         kwargs,
+     )
+     return ret_fut.wait()
+
+
+ _generated_methods = [
+     forward_async,
+     forward,
+ ]
+
+
+
+
+ def _remote_forward(
+         module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+     module = module_rref.local_value()
+     device = torch.device(device)
+
+     if device.type != "cuda":
+         return module.forward(*args, **kwargs)
+
+     # If the module is on a cuda device,
+     # move any CPU tensor in args or kwargs to the same cuda device.
+     # Since torch script does not support generator expression,
+     # have to use concatenation instead of
+     # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+     args = (*args,)
+     out_args: Tuple[()] = ()
+     for arg in args:
+         arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+         out_args = out_args + arg
+
+     kwargs = {**kwargs}
+     for k, v in kwargs.items():
+         if isinstance(v, Tensor):
+             kwargs[k] = kwargs[k].to(device)
+
+     if is_device_map_set:
+         return module.forward(*out_args, **kwargs)
+
+     # If the device map is empty, then only CPU tensors are allowed to send over wire,
+     # so have to move any GPU tensor to CPU in the output.
+     # Since torch script does not support generator expression,
+     # have to use concatenation instead of
+     # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+     ret: Tuple[()] = ()
+     for i in module.forward(*out_args, **kwargs):
+         i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+         ret = ret + i
+     return ret
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 32768,
+   "pad_token": "<|vision_pad|>",
+   "padding_side": "left",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
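
Note the padding setup: the tokenizer reuses <|vision_pad|> (id 151654, matching pad_token_id in config.json) as the pad token and pads on the left, a common Unsloth-style choice since that token is otherwise unused by a text-only model. A quick check, using the same illustrative directory name as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-500")
print(tok.pad_token, tok.pad_token_id, tok.padding_side)  # <|vision_pad|> 151654 left

# Left padding keeps the last position of every row a real token, which is
# the position causal-LM generation reads from.
batch = tok(["hi", "a longer example"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)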
torchinductor_ch-epfl-345354-j/2f/8f4778b9c2bdc504a3c5d1b5bc09dac279f18294d78f93a4c581178508bcf83b.best_config ADDED
@@ -0,0 +1 @@
+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 50}
torchinductor_ch-epfl-345354-j/2f/c2fzymhcr3rme5dtns3jvvyp6x3osfrlp7cc7zg7igwzigqmmg65.py ADDED
@@ -0,0 +1,46 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+     size_hints={'x': 8388608},
+     filename=__file__,
+     triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_cat_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_cat_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:]
+     xmask = xindex < xnumel
+     x4 = xindex
+     x0 = (xindex % ks0)
+     x2 = ((xindex // ks1) % ks2)
+     x5 = xindex // ks0
+     tmp0 = tl.load(in_ptr0 + (x4), xmask, eviction_policy='evict_last').to(tl.float32)
+     tmp1 = tl.load(in_ptr1 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+     tmp17 = tl.load(in_ptr2 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+     tmp2 = tmp0 * tmp1
+     tmp3 = x0
+     tmp4 = tl.full([1], 0, tl.int64)
+     tmp5 = tmp3 >= tmp4
+     tmp6 = ks0 + (-1)*(ks0 // 2)
+     tmp7 = tmp3 < tmp6
+     tmp8 = tl.load(in_ptr0 + (ks0*x5 + (ks0 // 2) + (x0)), xmask & tmp7, eviction_policy='evict_last', other=0.0).to(tl.float32)
+     tmp9 = -tmp8
+     tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
+     tmp11 = tl.where(tmp7, tmp9, tmp10)
+     tmp12 = tmp3 >= tmp6
+     tmp13 = ks0
+     tmp14 = tmp3 < tmp13
+     tmp15 = tl.load(in_ptr0 + (ks0*x5 + (x0 + ((-1)*ks0) + (ks0 // 2))), xmask & tmp12, eviction_policy='evict_last', other=0.0).to(tl.float32)
+     tmp16 = tl.where(tmp7, tmp11, tmp15)
+     tmp18 = tmp16 * tmp17
+     tmp19 = tmp2 + tmp18
+     tl.store(out_ptr0 + (x4), tmp19, xmask)
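
The fused mul/cat/add pattern in this kernel is the rotary-position-embedding application used by Qwen-style attention: the "cat" of the negated second half with the first half is rotate_half, and the output is x*cos + rotate_half(x)*sin. A plain PyTorch reference of that math (a sketch of the computation, not the kernel's exact indexing):

# Reference sketch: rotary position embedding, Qwen/LLaMA style.
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # The kernel's "cat" branch: negate the second half, swap the halves.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(x, cos, sin):
    return x * cos + rotate_half(x) * sin

x = torch.randn(1, 16, 8, 128, dtype=torch.bfloat16)   # (batch, heads, seq, head_dim)
cos = torch.randn(8, 128, dtype=torch.bfloat16)        # broadcast over batch/heads
sin = torch.randn(8, 128, dtype=torch.bfloat16)
out = apply_rope(x, cos, sin)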
torchinductor_ch-epfl-345354-j/2r/717267c6902a1a61a8cc50b68a007cec3f90a0241185b112347df8b34fa8c605.best_config ADDED
@@ -0,0 +1 @@
+ {"XBLOCK": 256, "num_warps": 4, "num_stages": 1, "configs_hash": "1b2cc4dbebb9680d3ce31843331593b159e4046c056f195ca1ccf2464d5b37d1", "found_by_coordesc": false, "time_taken_ms": 81}
torchinductor_ch-epfl-345354-j/2r/c2rhxwmxh62lojowjb65g6mbzowlwbjcacwjmn3vu63z4qatxuo3.py ADDED
@@ -0,0 +1,26 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+     size_hints={'x': 2048},
+     filename=__file__,
+     triton_meta={'signature': {'in_ptr0': '*i64', 'out_ptr0': '*fp32', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_nll_loss_backward_3', 'mutated_arg_names': ['out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_nll_loss_backward_3(in_ptr0, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:]
+     xmask = xindex < xnumel
+     x0 = xindex
+     tmp0 = tl.load(in_ptr0 + (x0), xmask)
+     tl.device_assert(((0 <= tmp0) & (tmp0 < ks0)) | ~(xmask), "index out of bounds: 0 <= tmp0 < ks0")
+     tmp2 = -1.0
+     tl.store(out_ptr0 + (tmp0 + ks0*x0), tmp2, xmask)
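
This kernel writes the sparse part of the NLL-loss backward: for each row it scatters -1.0 into the column of that row's target class (out_ptr0 has row stride ks0). The dense equivalent, sketched in PyTorch:

# Reference sketch: d(nll_loss)/d(log_probs), before reduction scaling and
# ignore_index handling, is -1 at each row's target column.
import torch

targets = torch.tensor([2, 5, 0])
num_classes = 8
grad = torch.zeros(targets.numel(), num_classes)
grad[torch.arange(targets.numel()), targets] = -1.0
print(grad)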
torchinductor_ch-epfl-345354-j/2y/c2ykxnj2iqrpp4u3ihziotcanxl3tc27h7ajzahx5wypy4anuhuj.py ADDED
@@ -0,0 +1,88 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+     size_hints={'x': 1, 'r0_': 2048},
+     reduction_hint=ReductionHint.INNER,
+     filename=__file__,
+     triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*i64', 'in_ptr1': '*i64', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'in_ptr7': '*fp32', 'in_ptr8': '*fp32', 'in_ptr9': '*fp32', 'out_ptr1': '*i1', 'out_ptr2': '*i64', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'xnumel': 'constexpr', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_add_nll_loss_backward_nll_loss_forward_15', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 10, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused_add_nll_loss_backward_nll_loss_forward_15(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, in_ptr8, in_ptr9, out_ptr1, out_ptr2, ks0, ks1, ks2, ks3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+     xnumel = 1
+     rnumel = r0_numel
+     RBLOCK: tl.constexpr = R0_BLOCK
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+     r0_base = tl.arange(0, R0_BLOCK)[None, :]
+     rbase = r0_base
+     _tmp28 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+     for r0_offset in range(0, r0_numel, R0_BLOCK):
+         r0_index = r0_offset + r0_base
+         r0_mask = r0_index < r0_numel
+         roffset = r0_offset
+         rindex = r0_index
+         r0_0 = r0_index
+         tmp5 = tl.load(in_ptr1 + (r0_0 + 6*((6 + ks0*ks1) // 7)), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp19 = tl.load(in_ptr3 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp21 = tl.load(in_ptr4 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp0 = ((r0_0 + 6*((6 + ks0*ks1) // 7)) % ks1)
+         tmp1 = (-1) + ks1
+         tmp2 = tmp0 == tmp1
+         tmp3 = tmp0 < tmp1
+         tmp4 = tl.load(in_ptr0 + (tl.broadcast_to(1 + r0_0 + 6*((6 + ks0*ks1) // 7), [XBLOCK, R0_BLOCK])), r0_mask & tmp3, eviction_policy='evict_first', other=0.0)
+         tmp6 = tl.where(tmp3, tmp4, tmp5)
+         tmp7 = tl.full([1, 1], -100, tl.int64)
+         tmp8 = tl.where(tmp2, tmp7, tmp6)
+         tmp9 = tmp8 != tmp7
+         tmp10 = tl.full([1, 1], 0, tl.int64)
+         tmp11 = tl.where(tmp9, tmp8, tmp10)
+         tmp12 = ks2
+         tmp13 = tmp11 + tmp12
+         tmp14 = tmp11 < 0
+         tmp15 = tl.where(tmp14, tmp13, tmp11)
+         tl.device_assert(((0 <= tmp15) & (tmp15 < ks2)) | ~(r0_mask), "index out of bounds: 0 <= tmp15 < ks2")
+         tmp17 = tl.load(in_ptr2 + (tmp15 + ks2*r0_0 + 6*ks2*((6 + ks0*ks3) // 7)), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+         tmp18 = tmp17.to(tl.float32)
+         tmp20 = tmp18 - tmp19
+         tmp22 = tl_math.log(tmp21)
+         tmp23 = tmp20 - tmp22
+         tmp24 = -tmp23
+         tmp25 = 0.0
+         tmp26 = tl.where(tmp9, tmp24, tmp25)
+         tmp27 = tl.broadcast_to(tmp26, [XBLOCK, R0_BLOCK])
+         tmp29 = _tmp28 + tmp27
+         _tmp28 = tl.where(r0_mask, tmp29, _tmp28)
+         tl.store(out_ptr1 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp9, r0_mask)
+         tl.store(out_ptr2 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp11, r0_mask)
+     tmp28 = tl.sum(_tmp28, 1)[:, None]
+     tmp30 = tl.load(in_out_ptr0 + (0))
+     tmp31 = tl.broadcast_to(tmp30, [XBLOCK, 1])
+     tmp34 = tl.load(in_ptr5 + (0))
+     tmp35 = tl.broadcast_to(tmp34, [XBLOCK, 1])
+     tmp37 = tl.load(in_ptr6 + (0))
+     tmp38 = tl.broadcast_to(tmp37, [XBLOCK, 1])
+     tmp40 = tl.load(in_ptr7 + (0))
+     tmp41 = tl.broadcast_to(tmp40, [XBLOCK, 1])
+     tmp43 = tl.load(in_ptr8 + (0))
+     tmp44 = tl.broadcast_to(tmp43, [XBLOCK, 1])
+     tmp46 = tl.load(in_ptr9 + (0))
+     tmp47 = tl.broadcast_to(tmp46, [XBLOCK, 1])
+     tmp32 = 0.0
+     tmp33 = tmp31 + tmp32
+     tmp36 = tmp33 + tmp35
+     tmp39 = tmp36 + tmp38
+     tmp42 = tmp39 + tmp41
+     tmp45 = tmp42 + tmp44
+     tmp48 = tmp45 + tmp47
+     tmp49 = tmp48 + tmp28
+     tl.debug_barrier()
+     tl.store(in_out_ptr0 + (tl.full([XBLOCK, 1], 0, tl.int32)), tmp49, None)
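
The index arithmetic at the top of the loop (tmp0 through tmp8) reconstructs shifted labels on the fly: position i in each sequence is scored against token i+1, and the final position of each sequence (where (r0 % ks1) == ks1 - 1) is masked with -100, the usual causal-LM ignore index. The (6 + ks0*ks1) // 7 offsets suggest the loss is computed in seven chunks, with this kernel handling the last chunk and folding the six earlier partial sums (in_ptr5..in_ptr9 plus in_out_ptr0) into the final total. The label handling alone, in plain PyTorch:

# Reference sketch: standard causal-LM next-token loss, with labels shifted
# left by one and the final position masked via ignore_index=-100.
import torch
import torch.nn.functional as F

input_ids = torch.tensor([[5, 9, 2, 7]])
labels = torch.roll(input_ids, shifts=-1, dims=1)
labels[:, -1] = -100  # no next token to predict at the final position

logits = torch.randn(1, 4, 16)  # (batch, seq, vocab)
loss = F.cross_entropy(logits.flatten(0, 1), labels.flatten(), ignore_index=-100)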
torchinductor_ch-epfl-345354-j/3m/c3mt4utggpr6zcsqyeele6646fofhvyk4xxtwll4gqqa5w6nrbct.py ADDED
@@ -0,0 +1,55 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+     size_hints={'x': 2048, 'r0_': 262144},
+     reduction_hint=ReductionHint.INNER,
+     filename=__file__,
+     triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__log_softmax__to_copy_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 2, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused__log_softmax__to_copy_2(in_out_ptr0, in_ptr0, out_ptr0, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+     rnumel = r0_numel
+     RBLOCK: tl.constexpr = R0_BLOCK
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+     xmask = xindex < xnumel
+     r0_base = tl.arange(0, R0_BLOCK)[None, :]
+     rbase = r0_base
+     x0 = xindex
+     _tmp3_max = tl.full([XBLOCK, R0_BLOCK], float('-inf'), tl.float32)
+     _tmp3_sum = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
+     for r0_offset in range(0, r0_numel, R0_BLOCK):
+         r0_index = r0_offset + r0_base
+         r0_mask = r0_index < r0_numel
+         roffset = r0_offset
+         rindex = r0_index
+         r0_1 = r0_index
+         tmp0 = tl.load(in_ptr0 + (r0_1 + ks2*x0 + 2*ks2*((6 + ks0*ks1) // 7)), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
+         tmp1 = tmp0.to(tl.float32)
+         tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
+
+         _tmp3_max_next, _tmp3_sum_next = triton_helpers.online_softmax_combine(
+             _tmp3_max, _tmp3_sum, tmp2, False
+         )
+
+         _tmp3_max = tl.where(r0_mask & xmask, _tmp3_max_next, _tmp3_max)
+         _tmp3_sum = tl.where(r0_mask & xmask, _tmp3_sum_next, _tmp3_sum)
+
+     tmp5, tmp6 = triton_helpers.online_softmax_reduce(
+         _tmp3_max, _tmp3_sum, 1, False)
+     tmp5 = tmp5[:, None]
+     tmp6 = tmp6[:, None]
+     tmp3 = tmp5
+     tmp4 = tmp6
+     tl.store(out_ptr0 + (x0), tmp3, xmask)
+     tmp7 = tl_math.log(tmp4)
+     tl.debug_barrier()
+     tl.store(in_out_ptr0 + (x0), tmp7, xmask)
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.reduction(
11
+ size_hints={'x': 2048, 'r0_': 262144},
12
+ reduction_hint=ReductionHint.INNER,
13
+ filename=__file__,
14
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'out_ptr0': '*fp32', 'out_ptr1': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
15
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_5', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 2, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
16
+ )
17
+ @triton.jit
18
+ def triton_red_fused__to_copy_5(in_ptr0, out_ptr0, out_ptr1, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
19
+ rnumel = r0_numel
20
+ RBLOCK: tl.constexpr = R0_BLOCK
21
+ xoffset = tl.program_id(0) * XBLOCK
22
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
23
+ xmask = xindex < xnumel
24
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
25
+ rbase = r0_base
26
+ x0 = xindex
27
+ _tmp3_max = tl.full([XBLOCK, R0_BLOCK], float('-inf'), tl.float32)
28
+ _tmp3_sum = tl.zeros([XBLOCK, R0_BLOCK], tl.float32)
29
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
30
+ r0_index = r0_offset + r0_base
31
+ r0_mask = r0_index < r0_numel
32
+ roffset = r0_offset
33
+ rindex = r0_index
34
+ r0_1 = r0_index
35
+ tmp0 = tl.load(in_ptr0 + (r0_1 + ks2*x0 + 5*ks2*((6 + ks0*ks1) // 7)), r0_mask & xmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
36
+ tmp1 = tmp0.to(tl.float32)
37
+ tmp2 = tl.broadcast_to(tmp1, [XBLOCK, R0_BLOCK])
38
+
39
+ _tmp3_max_next, _tmp3_sum_next = triton_helpers.online_softmax_combine(
40
+ _tmp3_max, _tmp3_sum, tmp2, False
41
+ )
42
+
43
+ _tmp3_max = tl.where(r0_mask & xmask, _tmp3_max_next, _tmp3_max)
44
+ _tmp3_sum = tl.where(r0_mask & xmask, _tmp3_sum_next, _tmp3_sum)
45
+
46
+ tmp5, tmp6 = triton_helpers.online_softmax_reduce(
47
+ _tmp3_max, _tmp3_sum, 1, False)
48
+ tmp5 = tmp5[:, None]
49
+ tmp6 = tmp6[:, None]
50
+ tmp3 = tmp5
51
+ tmp4 = tmp6
52
+ tl.store(out_ptr0 + (x0), tmp3, xmask)
53
+ tl.store(out_ptr1 + (x0), tmp4, xmask)
torchinductor_ch-epfl-345354-j/4i/c4iarmybewwgyq7pa6izmajgs66hg4cgb6yhmezt4tg6j77oklfi.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.persistent_reduction(
11
+ size_hints={'x': 8192, 'r0_': 1024},
12
+ reduction_hint=ReductionHint.INNER,
13
+ filename=__file__,
14
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
15
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': False, 'no_x_dim': True, 'num_load': 2, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
16
+ )
17
+ @triton.jit
18
+ def triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, r0_numel):
19
+ XBLOCK: tl.constexpr = 1
20
+ r0_numel = 1024
21
+ R0_BLOCK: tl.constexpr = 1024
22
+ rnumel = r0_numel
23
+ RBLOCK: tl.constexpr = R0_BLOCK
24
+ xoffset = tl.program_id(0) * XBLOCK
25
+ xindex = tl.full([1], xoffset, tl.int32)
26
+ xmask = tl.full([R0_BLOCK], True, tl.int1)
27
+ r0_index = tl.arange(0, R0_BLOCK)[:]
28
+ r0_offset = 0
29
+ r0_mask = tl.full([R0_BLOCK], True, tl.int1)
30
+ roffset = r0_offset
31
+ rindex = r0_index
32
+ r0_1 = r0_index
33
+ x0 = xindex
34
+ tmp0 = tl.load(in_ptr0 + (r0_1 + 1024*x0), None).to(tl.float32)
35
+ tmp11 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
36
+ tmp1 = tmp0.to(tl.float32)
37
+ tmp2 = tmp1 * tmp1
38
+ tmp3 = tl.broadcast_to(tmp2, [R0_BLOCK])
39
+ tmp5 = triton_helpers.promote_to_tensor(tl.sum(tmp3, 0))
40
+ tmp6 = 1024.0
41
+ tmp7 = (tmp5 / tmp6)
42
+ tmp8 = 1e-06
43
+ tmp9 = tmp7 + tmp8
44
+ tmp10 = libdevice.rsqrt(tmp9)
45
+ tmp12 = tmp1 * tmp10
46
+ tmp13 = tmp12.to(tl.float32)
47
+ tmp14 = tmp11 * tmp13
48
+ tl.debug_barrier()
49
+ tl.store(in_out_ptr0 + (x0), tmp10, None)
50
+ tl.store(out_ptr0 + (r0_1 + 1024*x0), tmp14, None)
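
This per-row reduction is RMSNorm with the sizes baked in: hidden dimension 1024 and eps 1e-06, matching hidden_size and rms_norm_eps in config.json. The input is upcast to fp32 for the mean-of-squares, cast back for the weighted output, and the rsqrt factor is also stashed (in_out_ptr0) for reuse in the backward pass. Reference math:

# Reference sketch of the fused kernel's computation: RMSNorm over the last dim.
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    h = x.float()                                   # upcast, as the kernel does
    rrms = torch.rsqrt(h.pow(2).mean(dim=-1, keepdim=True) + eps)
    return weight * (h * rrms).to(x.dtype)          # scale by the learned weight

x = torch.randn(2, 1024, dtype=torch.bfloat16)
w = torch.ones(1024, dtype=torch.bfloat16)
y = rms_norm(x, w)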
torchinductor_ch-epfl-345354-j/53/c53mrwlx5sxivgg5x5z6kkaldo2q5yn2pjsymcv27tpzj2cdoeww.py ADDED
@@ -0,0 +1,66 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+     size_hints={'x': 1, 'r0_': 2048},
+     reduction_hint=ReductionHint.INNER,
+     filename=__file__,
+     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*i64', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*i1', 'out_ptr2': '*i64', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'xnumel': 'constexpr', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_nll_loss_backward_nll_loss_forward_11', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused_nll_loss_backward_nll_loss_forward_11(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, ks3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+     xnumel = 1
+     rnumel = r0_numel
+     RBLOCK: tl.constexpr = R0_BLOCK
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+     r0_base = tl.arange(0, R0_BLOCK)[None, :]
+     rbase = r0_base
+     _tmp27 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+     for r0_offset in range(0, r0_numel, R0_BLOCK):
+         r0_index = r0_offset + r0_base
+         r0_mask = r0_index < r0_numel
+         roffset = r0_offset
+         rindex = r0_index
+         r0_0 = r0_index
+         tmp5 = tl.load(in_ptr1 + (((r0_0 + 2*((6 + ks0*ks1) // 7)) % (ks0*ks1))), r0_mask, eviction_policy='evict_last', other=0.0)
+         tmp19 = tl.load(in_ptr3 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp21 = tl.load(in_ptr4 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp0 = ((r0_0 + 2*((6 + ks0*ks1) // 7)) % ks1)
+         tmp1 = (-1) + ks1
+         tmp2 = tmp0 == tmp1
+         tmp3 = tmp0 < tmp1
+         tmp4 = tl.load(in_ptr0 + (tl.broadcast_to(1 + (((r0_0 + 2*((6 + ks0*ks1) // 7)) % (ks0*ks1))), [XBLOCK, R0_BLOCK])), r0_mask & tmp3, eviction_policy='evict_last', other=0.0)
+         tmp6 = tl.where(tmp3, tmp4, tmp5)
+         tmp7 = tl.full([1, 1], -100, tl.int64)
+         tmp8 = tl.where(tmp2, tmp7, tmp6)
+         tmp9 = tmp8 != tmp7
+         tmp10 = tl.full([1, 1], 0, tl.int64)
+         tmp11 = tl.where(tmp9, tmp8, tmp10)
+         tmp12 = ks2
+         tmp13 = tmp11 + tmp12
+         tmp14 = tmp11 < 0
+         tmp15 = tl.where(tmp14, tmp13, tmp11)
+         tl.device_assert(((0 <= tmp15) & (tmp15 < ks2)) | ~(r0_mask), "index out of bounds: 0 <= tmp15 < ks2")
+         tmp17 = tl.load(in_ptr2 + (tmp15 + ks2*r0_0 + 2*ks2*((6 + ks0*ks3) // 7)), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+         tmp18 = tmp17.to(tl.float32)
+         tmp20 = tmp18 - tmp19
+         tmp22 = tmp20 - tmp21
+         tmp23 = -tmp22
+         tmp24 = 0.0
+         tmp25 = tl.where(tmp9, tmp23, tmp24)
+         tmp26 = tl.broadcast_to(tmp25, [XBLOCK, R0_BLOCK])
+         tmp28 = _tmp27 + tmp26
+         _tmp27 = tl.where(r0_mask, tmp28, _tmp27)
+         tl.store(out_ptr1 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp9, r0_mask)
+         tl.store(out_ptr2 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp11, r0_mask)
+     tmp27 = tl.sum(_tmp27, 1)[:, None]
+     tl.store(out_ptr0 + (tl.full([XBLOCK, 1], 0, tl.int32)), tmp27, None)
torchinductor_ch-epfl-345354-j/56/c56q66j66nfzeu5puvuhal4wt2foih6rnb5nwmqolafn3iq33kjp.py ADDED
@@ -0,0 +1,66 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+     size_hints={'x': 1, 'r0_': 2048},
+     reduction_hint=ReductionHint.INNER,
+     filename=__file__,
+     triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*i64', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*i1', 'out_ptr2': '*i64', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'xnumel': 'constexpr', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_nll_loss_backward_nll_loss_forward_10', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused_nll_loss_backward_nll_loss_forward_10(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, ks3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+     xnumel = 1
+     rnumel = r0_numel
+     RBLOCK: tl.constexpr = R0_BLOCK
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+     xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+     r0_base = tl.arange(0, R0_BLOCK)[None, :]
+     rbase = r0_base
+     _tmp27 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+     for r0_offset in range(0, r0_numel, R0_BLOCK):
+         r0_index = r0_offset + r0_base
+         r0_mask = r0_index < r0_numel
+         roffset = r0_offset
+         rindex = r0_index
+         r0_0 = r0_index
+         tmp5 = tl.load(in_ptr1 + (((r0_0 + ((6 + ks0*ks1) // 7)) % (ks0*ks1))), r0_mask, eviction_policy='evict_last', other=0.0)
+         tmp19 = tl.load(in_ptr3 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp21 = tl.load(in_ptr4 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+         tmp0 = ((r0_0 + ((6 + ks0*ks1) // 7)) % ks1)
+         tmp1 = (-1) + ks1
+         tmp2 = tmp0 == tmp1
+         tmp3 = tmp0 < tmp1
+         tmp4 = tl.load(in_ptr0 + (tl.broadcast_to(1 + (((r0_0 + ((6 + ks0*ks1) // 7)) % (ks0*ks1))), [XBLOCK, R0_BLOCK])), r0_mask & tmp3, eviction_policy='evict_last', other=0.0)
+         tmp6 = tl.where(tmp3, tmp4, tmp5)
+         tmp7 = tl.full([1, 1], -100, tl.int64)
+         tmp8 = tl.where(tmp2, tmp7, tmp6)
+         tmp9 = tmp8 != tmp7
+         tmp10 = tl.full([1, 1], 0, tl.int64)
+         tmp11 = tl.where(tmp9, tmp8, tmp10)
+         tmp12 = ks2
+         tmp13 = tmp11 + tmp12
+         tmp14 = tmp11 < 0
+         tmp15 = tl.where(tmp14, tmp13, tmp11)
+         tl.device_assert(((0 <= tmp15) & (tmp15 < ks2)) | ~(r0_mask), "index out of bounds: 0 <= tmp15 < ks2")
+         tmp17 = tl.load(in_ptr2 + (tmp15 + ks2*r0_0 + ks2*((6 + ks0*ks3) // 7)), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+         tmp18 = tmp17.to(tl.float32)
+         tmp20 = tmp18 - tmp19
+         tmp22 = tmp20 - tmp21
+         tmp23 = -tmp22
+         tmp24 = 0.0
+         tmp25 = tl.where(tmp9, tmp23, tmp24)
+         tmp26 = tl.broadcast_to(tmp25, [XBLOCK, R0_BLOCK])
+         tmp28 = _tmp27 + tmp26
+         _tmp27 = tl.where(r0_mask, tmp28, _tmp27)
+         tl.store(out_ptr1 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp9, r0_mask)
+         tl.store(out_ptr2 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp11, r0_mask)
+     tmp27 = tl.sum(_tmp27, 1)[:, None]
+     tl.store(out_ptr0 + (tl.full([XBLOCK, 1], 0, tl.int32)), tmp27, None)
torchinductor_ch-epfl-345354-j/57/c573irrqes6p6it4yfyvqw2efgfbbgp7yjzxjjxq5jpeesj3bi77.py ADDED
@@ -0,0 +1,353 @@
+ """
+ Compile-time auto-tuning block:
+
+ import torch
+ from torch._dynamo.testing import rand_strided
+ from torch._dynamo.utils import preserve_rng_state
+ from torch._inductor.select_algorithm import AlgorithmSelectorCache
+ from torch._inductor.async_compile import AsyncCompile
+
+ async_compile = AsyncCompile()
+ generate_example_value = AlgorithmSelectorCache.generate_example_value
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+
+
+ triton_poi_fused_add_cat_mul_0 = async_compile.triton('triton_poi_fused_add_cat_mul_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 16777216},
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_cat_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_cat_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x4 = xindex
+ x0 = (xindex % ks0)
+ x2 = ((xindex // ks1) % ks2)
+ x5 = xindex // ks0
+ tmp0 = tl.load(in_ptr0 + (x4), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp17 = tl.load(in_ptr2 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp2 = tmp0 * tmp1
+ tmp3 = x0
+ tmp4 = tl.full([1], 0, tl.int64)
+ tmp5 = tmp3 >= tmp4
+ tmp6 = ks0 + (-1)*(ks0 // 2)
+ tmp7 = tmp3 < tmp6
+ tmp8 = tl.load(in_ptr0 + (ks0*x5 + (ks0 // 2) + (x0)), xmask & tmp7, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp9 = -tmp8
+ tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
+ tmp11 = tl.where(tmp7, tmp9, tmp10)
+ tmp12 = tmp3 >= tmp6
+ tmp13 = ks0
+ tmp14 = tmp3 < tmp13
+ tmp15 = tl.load(in_ptr0 + (ks0*x5 + (x0 + ((-1)*ks0) + (ks0 // 2))), xmask & tmp12, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp16 = tl.where(tmp7, tmp11, tmp15)
+ tmp18 = tmp16 * tmp17
+ tmp19 = tmp2 + tmp18
+ tl.store(out_ptr0 + (x4), tmp19, xmask)
+ ''', device_str='cuda')
+
+
+ triton_poi_fused_add_cat_mul_1 = async_compile.triton('triton_poi_fused_add_cat_mul_1', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8388608},
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_cat_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_cat_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x4 = xindex
+ x0 = (xindex % ks0)
+ x2 = ((xindex // ks1) % ks2)
+ x5 = xindex // ks0
+ tmp0 = tl.load(in_ptr0 + (x4), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp17 = tl.load(in_ptr2 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp2 = tmp0 * tmp1
+ tmp3 = x0
+ tmp4 = tl.full([1], 0, tl.int64)
+ tmp5 = tmp3 >= tmp4
+ tmp6 = ks0 + (-1)*(ks0 // 2)
+ tmp7 = tmp3 < tmp6
+ tmp8 = tl.load(in_ptr0 + (ks0*x5 + (ks0 // 2) + (x0)), xmask & tmp7, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp9 = -tmp8
+ tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
+ tmp11 = tl.where(tmp7, tmp9, tmp10)
+ tmp12 = tmp3 >= tmp6
+ tmp13 = ks0
+ tmp14 = tmp3 < tmp13
+ tmp15 = tl.load(in_ptr0 + (ks0*x5 + (x0 + ((-1)*ks0) + (ks0 // 2))), xmask & tmp12, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp16 = tl.where(tmp7, tmp11, tmp15)
+ tmp18 = tmp16 * tmp17
+ tmp19 = tmp2 + tmp18
+ tl.store(out_ptr0 + (x4), tmp19, xmask)
+ ''', device_str='cuda')
+
+ async_compile.wait(globals())
+ del async_compile
+
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ stream0 = get_raw_stream(0)
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ stream0 = get_raw_stream(0)
+ arg7_1 = generate_example_value((8, 16, 1000, 128), (2048000, 128, 2048, 1), 'cuda:0', torch.bfloat16, 0, (8, 16, 1000, 128))
+ arg2_1 = generate_example_value((1, 1000, 128), (128000, 128, 1), 'cuda:0', torch.bfloat16, 0, (1, 1000, 128))
+ arg4_1 = generate_example_value((1, 1000, 128), (128000, 128, 1), 'cuda:0', torch.bfloat16, 0, (1, 1000, 128))
+ buf0 = generate_example_value((8, 16, 1000, 128), (2048000, 128, 2048, 1), 'cuda:0', torch.bfloat16, 0, (8, 16, 1000, 128))
+ triton_poi_fused_add_cat_mul_0.run(arg7_1, arg2_1, arg4_1, buf0, 128, 2048, 1000, 16384000, stream=stream0)
+ del arg7_1, arg2_1, arg4_1, buf0
+
+ stream0 = get_raw_stream(0)
+ arg8_1 = generate_example_value((8, 8, 1000, 128), (1024000, 128, 1024, 1), 'cuda:0', torch.bfloat16, 0, (8, 8, 1000, 128))
+ arg2_1 = generate_example_value((1, 1000, 128), (128000, 128, 1), 'cuda:0', torch.bfloat16, 0, (1, 1000, 128))
+ arg4_1 = generate_example_value((1, 1000, 128), (128000, 128, 1), 'cuda:0', torch.bfloat16, 0, (1, 1000, 128))
+ buf1 = generate_example_value((8, 8, 1000, 128), (1024000, 128, 1024, 1), 'cuda:0', torch.bfloat16, 0, (8, 8, 1000, 128))
+ triton_poi_fused_add_cat_mul_1.run(arg8_1, arg2_1, arg4_1, buf1, 128, 1024, 1000, 8192000, stream=stream0)
+ del arg8_1, arg2_1, arg4_1, buf1
+
+ """
+ # AOT ID: ['3_inference']
+ from ctypes import c_void_p, c_long, c_int
+ import torch
+ import math
+ import random
+ import os
+ import tempfile
+ from math import inf, nan
+ from cmath import nanj
+ from torch._inductor.hooks import run_intermediate_hooks
+ from torch._inductor.utils import maybe_profile
+ from torch._inductor.codegen.memory_planning import _align as align
+ from torch import device, empty_strided
+ from torch._inductor.async_compile import AsyncCompile
+ from torch._inductor.select_algorithm import extern_kernels
+ from torch._inductor.codegen.multi_kernel import MultiKernelCall
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+ aten = torch.ops.aten
+ inductor_ops = torch.ops.inductor
+ _quantized = torch.ops._quantized
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
+ async_compile = AsyncCompile()
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/nf/cnf7ddto2mtv7utbz2ev3zn2hkgmh5nivfwjn3kwh7vb2j3fnuyw.py
+ # Topologically Sorted Source Nodes: [mul, cat, mul_1, q_embed], Original ATen: [aten.mul, aten.cat, aten.add]
+ # Source node to ATen node mapping:
+ # cat => cat
+ # mul => mul_8
+ # mul_1 => mul_29
+ # q_embed => add_36
+ # Graph fragment:
+ # %mul_8 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg7_1, %unsqueeze), kwargs = {})
+ # %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%neg, %slice_1], -1), kwargs = {})
+ # %mul_29 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat, %unsqueeze_1), kwargs = {})
+ # %add_36 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_8, %mul_29), kwargs = {})
+ triton_poi_fused_add_cat_mul_0 = async_compile.triton('triton_poi_fused_add_cat_mul_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 16777216},
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_cat_mul_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_cat_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x4 = xindex
+ x0 = (xindex % ks0)
+ x2 = ((xindex // ks1) % ks2)
+ x5 = xindex // ks0
+ tmp0 = tl.load(in_ptr0 + (x4), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp17 = tl.load(in_ptr2 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp2 = tmp0 * tmp1
+ tmp3 = x0
+ tmp4 = tl.full([1], 0, tl.int64)
+ tmp5 = tmp3 >= tmp4
+ tmp6 = ks0 + (-1)*(ks0 // 2)
+ tmp7 = tmp3 < tmp6
+ tmp8 = tl.load(in_ptr0 + (ks0*x5 + (ks0 // 2) + (x0)), xmask & tmp7, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp9 = -tmp8
+ tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
+ tmp11 = tl.where(tmp7, tmp9, tmp10)
+ tmp12 = tmp3 >= tmp6
+ tmp13 = ks0
+ tmp14 = tmp3 < tmp13
+ tmp15 = tl.load(in_ptr0 + (ks0*x5 + (x0 + ((-1)*ks0) + (ks0 // 2))), xmask & tmp12, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp16 = tl.where(tmp7, tmp11, tmp15)
+ tmp18 = tmp16 * tmp17
+ tmp19 = tmp2 + tmp18
+ tl.store(out_ptr0 + (x4), tmp19, xmask)
+ ''', device_str='cuda')
+
+
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/2f/c2fzymhcr3rme5dtns3jvvyp6x3osfrlp7cc7zg7igwzigqmmg65.py
+ # Topologically Sorted Source Nodes: [mul_2, cat_1, mul_3, k_embed], Original ATen: [aten.mul, aten.cat, aten.add]
+ # Source node to ATen node mapping:
+ # cat_1 => cat_1
+ # k_embed => add_72
+ # mul_2 => mul_38
+ # mul_3 => mul_59
+ # Graph fragment:
+ # %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg8_1, %unsqueeze), kwargs = {})
+ # %cat_1 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%neg_1, %slice_3], -1), kwargs = {})
+ # %mul_59 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%cat_1, %unsqueeze_1), kwargs = {})
+ # %add_72 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_38, %mul_59), kwargs = {})
+ triton_poi_fused_add_cat_mul_1 = async_compile.triton('triton_poi_fused_add_cat_mul_1', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8388608},
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'out_ptr0': '*bf16', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_cat_mul_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_cat_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x4 = xindex
+ x0 = (xindex % ks0)
+ x2 = ((xindex // ks1) % ks2)
+ x5 = xindex // ks0
+ tmp0 = tl.load(in_ptr0 + (x4), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp17 = tl.load(in_ptr2 + (x0 + ks0*x2), xmask, eviction_policy='evict_last').to(tl.float32)
+ tmp2 = tmp0 * tmp1
+ tmp3 = x0
+ tmp4 = tl.full([1], 0, tl.int64)
+ tmp5 = tmp3 >= tmp4
+ tmp6 = ks0 + (-1)*(ks0 // 2)
+ tmp7 = tmp3 < tmp6
+ tmp8 = tl.load(in_ptr0 + (ks0*x5 + (ks0 // 2) + (x0)), xmask & tmp7, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp9 = -tmp8
+ tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
+ tmp11 = tl.where(tmp7, tmp9, tmp10)
+ tmp12 = tmp3 >= tmp6
+ tmp13 = ks0
+ tmp14 = tmp3 < tmp13
+ tmp15 = tl.load(in_ptr0 + (ks0*x5 + (x0 + ((-1)*ks0) + (ks0 // 2))), xmask & tmp12, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp16 = tl.where(tmp7, tmp11, tmp15)
+ tmp18 = tmp16 * tmp17
+ tmp19 = tmp2 + tmp18
+ tl.store(out_ptr0 + (x4), tmp19, xmask)
+ ''', device_str='cuda')
+
+
+ async_compile.wait(globals())
+ del async_compile
+
+ def call(args):
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1 = args
+ args.clear()
+ s0 = arg0_1
+ s1 = arg1_1
+ s7 = arg5_1
+ s8 = arg6_1
+ assert_size_stride(arg2_1, (1, s0, s1), (s0*s1, s1, 1))
+ assert_size_stride(arg4_1, (1, s0, s1), (s0*s1, s1, 1))
+ assert_size_stride(arg7_1, (s7, s8, s0, s1), (s0*s1*s8, s1, s1*s8, 1))
+ assert_size_stride(arg8_1, (s7, s7, s0, s1), (s0*s1*s7, s1, s1*s7, 1))
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ ps0 = s1*s8
+ pool1 = empty_strided_cuda((s7, s8, s0, s1), (s0*s1*s8, s1, s1*s8, 1), torch.bfloat16)
+ buf0 = pool1 # alloc
+ # Topologically Sorted Source Nodes: [mul, cat, mul_1, q_embed], Original ATen: [aten.mul, aten.cat, aten.add]
+ triton_poi_fused_add_cat_mul_0_xnumel = s0*s1*s7*s8
+ stream0 = get_raw_stream(0)
+ triton_poi_fused_add_cat_mul_0.run(arg7_1, arg2_1, arg4_1, buf0, s1, ps0, s0, triton_poi_fused_add_cat_mul_0_xnumel, stream=stream0)
+ del arg7_1
+ ps1 = s1*s7
+ pool0 = empty_strided_cuda((s7, s7, s0, s1), (s0*s1*s7, s1, s1*s7, 1), torch.bfloat16)
+ buf1 = pool0 # alloc
+ # Topologically Sorted Source Nodes: [mul_2, cat_1, mul_3, k_embed], Original ATen: [aten.mul, aten.cat, aten.add]
+ triton_poi_fused_add_cat_mul_1_xnumel = s0*s1*s7*s7
+ stream0 = get_raw_stream(0)
+ triton_poi_fused_add_cat_mul_1.run(arg8_1, arg2_1, arg4_1, buf1, s1, ps1, s0, triton_poi_fused_add_cat_mul_1_xnumel, stream=stream0)
+ del arg2_1
+ del arg4_1
+ del arg8_1
+ return (buf0, buf1, )
+
+
+ def benchmark_compiled_module(times=10, repeat=10):
+ from torch._dynamo.testing import rand_strided
+ from torch._inductor.utils import print_performance
+ arg0_1 = 1000
+ arg1_1 = 128
+ arg2_1 = rand_strided((1, 1000, 128), (128000, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+ arg3_1 = 1
+ arg4_1 = rand_strided((1, 1000, 128), (128000, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+ arg5_1 = 8
+ arg6_1 = 16
+ arg7_1 = rand_strided((8, 16, 1000, 128), (2048000, 128, 2048, 1), device='cuda:0', dtype=torch.bfloat16)
+ arg8_1 = rand_strided((8, 8, 1000, 128), (1024000, 128, 1024, 1), device='cuda:0', dtype=torch.bfloat16)
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1])
+ return print_performance(fn, times=times, repeat=repeat)
+
+
+ if __name__ == "__main__":
+ from torch._inductor.wrapper_benchmark import compiled_module_main
+ compiled_module_main('None', benchmark_compiled_module)
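Note (editorial): the two fused kernels in this file apply rotary position embeddings (RoPE) to the query and key tensors; the graph fragments above spell out the pattern (mul, cat([neg, slice], -1), mul, add). A minimal eager-mode sketch, assuming the standard rotate_half formulation; the helper names are illustrative, not taken from the generated code:

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Negate the second half of the last dim and swap it in front,
    # mirroring the cat([%neg, %slice_1], -1) node above.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    # q: (B, H, S, D); k: (B, H_kv, S, D); cos/sin: (1, S, D), matching the
    # (1, 1000, 128) example inputs in benchmark_compiled_module above.
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)  # broadcast over heads
    q_embed = q * cos + rotate_half(q) * sin       # kernel 0 (q_embed)
    k_embed = k * cos + rotate_half(k) * sin       # kernel 1 (k_embed)
    return q_embed, k_embed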
torchinductor_ch-epfl-345354-j/57/c574kngiopy3pgespyoupnzlae4d5tokyeui7uglwglnym2qijvn.py ADDED
@@ -0,0 +1,30 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 33554432},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_mul_silu_0(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp5 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tmp0.to(tl.float32)
+ tmp2 = tl.sigmoid(tmp1)
+ tmp3 = tmp1 * tmp2
+ tmp4 = tmp3.to(tl.float32)
+ tmp6 = tmp4 * tmp5
+ tl.store(in_out_ptr0 + (x0), tmp6, xmask)
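Note (editorial): triton_poi_fused_mul_silu_0 is the fused SwiGLU-style MLP gate, silu(gate) * up, written back in place over the gate buffer; the kernel computes the SiLU in float32 and casts back to bfloat16 before the multiply. A minimal eager-mode sketch; the argument names are illustrative:

import torch
import torch.nn.functional as F

def silu_mul_(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    # gate corresponds to the mutated in_out_ptr0, up to in_ptr0.
    act = F.silu(gate.float()).to(gate.dtype)  # x * sigmoid(x) in fp32, then back
    return gate.copy_(act * up)                # in-place store, as in the kernel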
torchinductor_ch-epfl-345354-j/57/dad7be19dc394c1e08368515640dff88b78797aaabae100a15a1f195476a9a87.best_config ADDED
@@ -0,0 +1 @@
+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 67}
torchinductor_ch-epfl-345354-j/5e/c5enonf6qztlsw7dozsqkejk4exzt4n56gbz6fiey2gnus5vdf76.py ADDED
@@ -0,0 +1,66 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+ size_hints={'x': 1, 'r0_': 2048},
+ reduction_hint=ReductionHint.INNER,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*i64', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'out_ptr0': '*fp32', 'out_ptr1': '*i1', 'out_ptr2': '*i64', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'ks3': 'i32', 'xnumel': 'constexpr', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused_nll_loss_backward_nll_loss_forward_12', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused_nll_loss_backward_nll_loss_forward_12(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr1, out_ptr2, ks0, ks1, ks2, ks3, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+ xnumel = 1
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+ xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
+ rbase = r0_base
+ _tmp27 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
+ r0_index = r0_offset + r0_base
+ r0_mask = r0_index < r0_numel
+ roffset = r0_offset
+ rindex = r0_index
+ r0_0 = r0_index
+ tmp5 = tl.load(in_ptr1 + (((r0_0 + 3*((6 + ks0*ks1) // 7)) % (ks0*ks1))), r0_mask, eviction_policy='evict_last', other=0.0)
+ tmp19 = tl.load(in_ptr3 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+ tmp21 = tl.load(in_ptr4 + (r0_0), r0_mask, eviction_policy='evict_first', other=0.0)
+ tmp0 = ((r0_0 + 3*((6 + ks0*ks1) // 7)) % ks1)
+ tmp1 = (-1) + ks1
+ tmp2 = tmp0 == tmp1
+ tmp3 = tmp0 < tmp1
+ tmp4 = tl.load(in_ptr0 + (tl.broadcast_to(1 + (((r0_0 + 3*((6 + ks0*ks1) // 7)) % (ks0*ks1))), [XBLOCK, R0_BLOCK])), r0_mask & tmp3, eviction_policy='evict_last', other=0.0)
+ tmp6 = tl.where(tmp3, tmp4, tmp5)
+ tmp7 = tl.full([1, 1], -100, tl.int64)
+ tmp8 = tl.where(tmp2, tmp7, tmp6)
+ tmp9 = tmp8 != tmp7
+ tmp10 = tl.full([1, 1], 0, tl.int64)
+ tmp11 = tl.where(tmp9, tmp8, tmp10)
+ tmp12 = ks2
+ tmp13 = tmp11 + tmp12
+ tmp14 = tmp11 < 0
+ tmp15 = tl.where(tmp14, tmp13, tmp11)
+ tl.device_assert(((0 <= tmp15) & (tmp15 < ks2)) | ~(r0_mask), "index out of bounds: 0 <= tmp15 < ks2")
+ tmp17 = tl.load(in_ptr2 + (tmp15 + ks2*r0_0 + 3*ks2*((6 + ks0*ks3) // 7)), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
+ tmp18 = tmp17.to(tl.float32)
+ tmp20 = tmp18 - tmp19
+ tmp22 = tmp20 - tmp21
+ tmp23 = -tmp22
+ tmp24 = 0.0
+ tmp25 = tl.where(tmp9, tmp23, tmp24)
+ tmp26 = tl.broadcast_to(tmp25, [XBLOCK, R0_BLOCK])
+ tmp28 = _tmp27 + tmp26
+ _tmp27 = tl.where(r0_mask, tmp28, _tmp27)
+ tl.store(out_ptr1 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp9, r0_mask)
+ tl.store(out_ptr2 + (tl.broadcast_to(r0_0, [XBLOCK, R0_BLOCK])), tmp11, r0_mask)
+ tmp27 = tl.sum(_tmp27, 1)[:, None]
+ tl.store(out_ptr0 + (tl.full([XBLOCK, 1], 0, tl.int32)), tmp27, None)
torchinductor_ch-epfl-345354-j/5x/c5xsvywggx5vrzm2l5uaktu7pipclhdn5h6263yru2ugvuhe2nak.py ADDED
@@ -0,0 +1,57 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.persistent_reduction(
+ size_hints={'x': 8192, 'r0_': 1024},
+ reduction_hint=ReductionHint.INNER,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_div_mul_pow_sum_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': True, 'num_load': 4, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_per_fused__to_copy_add_div_mul_pow_sum_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr1, xnumel, r0_numel):
+ XBLOCK: tl.constexpr = 1
+ r0_numel = 1024
+ R0_BLOCK: tl.constexpr = 1024
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = tl.full([1], xoffset, tl.int32)
+ xmask = tl.full([R0_BLOCK], True, tl.int1)
+ r0_index = tl.arange(0, R0_BLOCK)[:]
+ r0_offset = 0
+ r0_mask = tl.full([R0_BLOCK], True, tl.int1)
+ roffset = r0_offset
+ rindex = r0_index
+ r0_1 = r0_index
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (r0_1 + 1024*x0), None).to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
+ tmp4 = tl.load(in_ptr2 + (r0_1 + 1024*x0), None).to(tl.float32)
+ tmp10 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+ tmp2 = tmp0 * tmp1
+ tmp3 = tmp2.to(tl.float32)
+ tmp5 = tmp4.to(tl.float32)
+ tmp6 = tmp3 * tmp5
+ tmp7 = tl.broadcast_to(tmp6, [R0_BLOCK])
+ tmp9 = triton_helpers.promote_to_tensor(tl.sum(tmp7, 0))
+ tmp11 = tmp3 * tmp10
+ tmp12 = -0.5
+ tmp13 = tmp9 * tmp12
+ tmp14 = tmp10 * tmp10
+ tmp15 = tmp14 * tmp10
+ tmp16 = tmp13 * tmp15
+ tmp17 = 0.0009765625
+ tmp18 = tmp16 * tmp17
+ tmp19 = 2.0
+ tmp20 = tmp5 * tmp19
+ tmp21 = tmp18 * tmp20
+ tmp22 = tmp11 + tmp21
+ tmp23 = tmp22.to(tl.float32)
+ tl.store(out_ptr1 + (r0_1 + 1024*x0), tmp23, None)
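Note (editorial): this per-row persistent reduction appears to be the input-gradient of an RMSNorm over a hidden size of 1024 (note the 1/1024 = 0.0009765625 constant). Given g = grad_out * weight, the saved activations x, and the saved r = rsqrt(mean(x^2) + eps), it computes dx = g*r - sum(g*x) * r^3 * x / N. A minimal eager-mode sketch of that reconstruction; names are illustrative:

import torch

def rmsnorm_backward_input(grad_out, weight, x, r):
    # grad_out, x: (..., N) bf16; weight: (N,) bf16; r: (..., 1) fp32 saved rsqrt.
    g = (grad_out * weight).float()
    xf = x.float()
    s = (g * xf).sum(dim=-1, keepdim=True)   # the per-row reduction (tmp10 above)
    n = x.shape[-1]                          # 1024 here, so 1/n = 0.0009765625
    dx = g * r + (-0.5 * s * r**3 / n) * (2.0 * xf)
    return dx.to(x.dtype)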
torchinductor_ch-epfl-345354-j/6d/c6dsbxlebwjqawzeprkq3lkldtxoiept4c6bpgtva5r4mjlrnwlr.py ADDED
@@ -0,0 +1,229 @@
+ """
+ Compile-time auto-tuning block:
+
+ import torch
+ from torch._dynamo.testing import rand_strided
+ from torch._dynamo.utils import preserve_rng_state
+ from torch._inductor.select_algorithm import AlgorithmSelectorCache
+ from torch._inductor.async_compile import AsyncCompile
+
+ async_compile = AsyncCompile()
+ generate_example_value = AlgorithmSelectorCache.generate_example_value
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+
+
+ triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0 = async_compile.triton('triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.persistent_reduction(
+ size_hints={'x': 131072, 'r0_': 128},
+ reduction_hint=ReductionHint.INNER,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0(in_ptr0, in_ptr1, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
+ r0_numel = 128
+ R0_BLOCK: tl.constexpr = 128
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+ xmask = xindex < xnumel
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
+ r0_offset = 0
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+ roffset = r0_offset
+ rindex = r0_index
+ r0_1 = r0_index
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
+ tmp7 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
+ tmp1 = tmp0.to(tl.float32)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+ tmp5 = tl.where(xmask, tmp3, 0)
+ tmp6 = tl.sum(tmp5, 1)[:, None]
+ tmp8 = 128.0
+ tmp9 = (tmp6 / tmp8)
+ tmp10 = 1e-06
+ tmp11 = tmp9 + tmp10
+ tmp12 = libdevice.rsqrt(tmp11)
+ tmp13 = tmp1 * tmp12
+ tmp14 = tmp13.to(tl.float32)
+ tmp15 = tmp7 * tmp14
+ tl.store(out_ptr1 + (r0_1 + 128*x0), tmp15, xmask)
+ ''', device_str='cuda')
+
+ async_compile.wait(globals())
+ del async_compile
+
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ stream0 = get_raw_stream(0)
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ stream0 = get_raw_stream(0)
+ arg3_1 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ arg4_1 = generate_example_value((128,), (1,), 'cuda:0', torch.bfloat16, 0, (128,))
+ buf1 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0.run(arg3_1, arg4_1, buf1, 128000, 128, stream=stream0)
+ del arg3_1, arg4_1, buf1
+
+ """
+ # AOT ID: ['2_inference']
+ from ctypes import c_void_p, c_long, c_int
+ import torch
+ import math
+ import random
+ import os
+ import tempfile
+ from math import inf, nan
+ from cmath import nanj
+ from torch._inductor.hooks import run_intermediate_hooks
+ from torch._inductor.utils import maybe_profile
+ from torch._inductor.codegen.memory_planning import _align as align
+ from torch import device, empty_strided
+ from torch._inductor.async_compile import AsyncCompile
+ from torch._inductor.select_algorithm import extern_kernels
+ from torch._inductor.codegen.multi_kernel import MultiKernelCall
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+ aten = torch.ops.aten
+ inductor_ops = torch.ops.inductor
+ _quantized = torch.ops._quantized
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
+ async_compile = AsyncCompile()
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/l7/cl7z3tsg3vjxfjb3vqjym4iefsq6h5o7fsfsmbi65q55e56x3lm7.py
+ # Topologically Sorted Source Nodes: [hidden_states, pow_1, variance, add, rsqrt, hidden_states_1, to_1, mul_1], Original ATen: [aten._to_copy, aten.pow, aten.mean, aten.add, aten.rsqrt, aten.mul]
+ # Source node to ATen node mapping:
+ # add => add_15
+ # hidden_states => convert_element_type
+ # hidden_states_1 => mul_17
+ # mul_1 => mul_26
+ # pow_1 => pow_1
+ # rsqrt => rsqrt
+ # to_1 => convert_element_type_1
+ # variance => mean
+ # Graph fragment:
+ # %convert_element_type : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg3_1, torch.float32), kwargs = {})
+ # %pow_1 : [num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 2), kwargs = {})
+ # %mean : [num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [-1], True), kwargs = {})
+ # %add_15 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mean, 1e-06), kwargs = {})
+ # %rsqrt : [num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_15,), kwargs = {})
+ # %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
+ # %convert_element_type_1 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_17, torch.bfloat16), kwargs = {})
+ # %mul_26 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg4_1, %convert_element_type_1), kwargs = {})
+ triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0 = async_compile.triton('triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.persistent_reduction(
+ size_hints={'x': 131072, 'r0_': 128},
+ reduction_hint=ReductionHint.INNER,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0(in_ptr0, in_ptr1, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
+ r0_numel = 128
+ R0_BLOCK: tl.constexpr = 128
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+ xmask = xindex < xnumel
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
+ r0_offset = 0
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+ roffset = r0_offset
+ rindex = r0_index
+ r0_1 = r0_index
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
+ tmp7 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
+ tmp1 = tmp0.to(tl.float32)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+ tmp5 = tl.where(xmask, tmp3, 0)
+ tmp6 = tl.sum(tmp5, 1)[:, None]
+ tmp8 = 128.0
+ tmp9 = (tmp6 / tmp8)
+ tmp10 = 1e-06
+ tmp11 = tmp9 + tmp10
+ tmp12 = libdevice.rsqrt(tmp11)
+ tmp13 = tmp1 * tmp12
+ tmp14 = tmp13.to(tl.float32)
+ tmp15 = tmp7 * tmp14
+ tl.store(out_ptr1 + (r0_1 + 128*x0), tmp15, xmask)
+ ''', device_str='cuda')
+
+
+ async_compile.wait(globals())
+ del async_compile
+
+ def call(args):
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args
+ args.clear()
+ s3 = arg0_1
+ s4 = arg1_1
+ s5 = arg2_1
+ assert_size_stride(arg3_1, (s3, s4, s5, 128), (128*s4*s5, 128*s5, 128, 1))
+ assert_size_stride(arg4_1, (128, ), (1, ))
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ pool0 = empty_strided_cuda((s3, s4, s5, 128), (128*s4*s5, 128*s5, 128, 1), torch.bfloat16)
+ buf1 = pool0 # alloc
+ # Topologically Sorted Source Nodes: [hidden_states, pow_1, variance, add, rsqrt, hidden_states_1, to_1, mul_1], Original ATen: [aten._to_copy, aten.pow, aten.mean, aten.add, aten.rsqrt, aten.mul]
+ triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0_xnumel = s3*s4*s5
+ stream0 = get_raw_stream(0)
+ triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0.run(arg3_1, arg4_1, buf1, triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0_xnumel, 128, stream=stream0)
+ del arg3_1
+ del arg4_1
+ return (buf1, )
+
+
+ def benchmark_compiled_module(times=10, repeat=10):
+ from torch._dynamo.testing import rand_strided
+ from torch._inductor.utils import print_performance
+ arg0_1 = 8
+ arg1_1 = 1000
+ arg2_1 = 16
+ arg3_1 = rand_strided((8, 1000, 16, 128), (2048000, 2048, 128, 1), device='cuda:0', dtype=torch.bfloat16)
+ arg4_1 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1])
+ return print_performance(fn, times=times, repeat=repeat)
+
+
+ if __name__ == "__main__":
+ from torch._inductor.wrapper_benchmark import compiled_module_main
+ compiled_module_main('None', benchmark_compiled_module)
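Note (editorial): the fused kernel in this file is a per-head RMSNorm over a last dimension of 128 with eps = 1e-6, exactly as the graph fragment above spells out. A minimal eager-mode equivalent of that formulation:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    # x: (..., 128) bf16; weight: (128,) bf16; variance computed in fp32.
    xf = x.float()
    r = torch.rsqrt(xf.pow(2).mean(dim=-1, keepdim=True) + eps)
    return weight * (xf * r).to(x.dtype)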
torchinductor_ch-epfl-345354-j/6j/7c215475e7b40a21cf286026270965eb7f07e7c3af1c4052d331de3f74c6449e.best_config ADDED
@@ -0,0 +1 @@
+ {"XBLOCK": 1024, "num_warps": 4, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 68}
torchinductor_ch-epfl-345354-j/6j/c6j5lx5qgycfvyi3dm5f4mo3ssluzzsrmdq32pka7e6pyhg42zvd.py ADDED
@@ -0,0 +1,499 @@
+ """
+ Compile-time auto-tuning block:
+
+ import torch
+ from torch._dynamo.testing import rand_strided
+ from torch._dynamo.utils import preserve_rng_state
+ from torch._inductor.select_algorithm import AlgorithmSelectorCache
+ from torch._inductor.async_compile import AsyncCompile
+
+ async_compile = AsyncCompile()
+ generate_example_value = AlgorithmSelectorCache.generate_example_value
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+
+
+ triton_red_fused__to_copy_mul_sum_0 = async_compile.triton('triton_red_fused__to_copy_mul_sum_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+ size_hints={'x': 65536, 'r0_': 512},
+ reduction_hint=ReductionHint.OUTER,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_mul_sum_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused__to_copy_mul_sum_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+ xnumel = 40960
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+ xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
+ rbase = r0_base
+ x1 = xindex // 128
+ x0 = (xindex % 128)
+ _tmp13 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+ x3 = xindex
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
+ r0_index = r0_offset + r0_base
+ r0_mask = r0_index < r0_numel
+ roffset = r0_offset
+ rindex = r0_index
+ r0_2 = r0_index
+ tmp0 = r0_2 + x1*((319 + ks0*ks1*ks2) // 320)
+ tmp1 = ks0*ks1*ks2
+ tmp2 = tmp0 < tmp1
+ tmp3 = tl.load(in_ptr0 + (x0 + 128*(((r0_2 + x1*((319 + ks0*ks1*ks2) // 320)) % (ks0*ks1*ks2)))), r0_mask & tmp2, eviction_policy='evict_first', other=0.0).to(tl.float32)
+ tmp4 = tl.load(in_ptr1 + (x0 + 128*(((r0_2 + x1*((319 + ks0*ks1*ks2) // 320)) % (ks0*ks1*ks2)))), r0_mask & tmp2, eviction_policy='evict_first', other=0.0).to(tl.float32)
+ tmp5 = tmp4.to(tl.float32)
+ tmp6 = tl.load(in_ptr2 + (((r0_2 + x1*((319 + ks0*ks1*ks2) // 320)) % (ks0*ks1*ks2))), r0_mask & tmp2, eviction_policy='evict_last', other=0.0)
+ tmp7 = tmp5 * tmp6
+ tmp8 = tmp7.to(tl.float32)
+ tmp9 = tmp3 * tmp8
+ tmp10 = tl.full(tmp9.shape, 0, tmp9.dtype)
+ tmp11 = tl.where(tmp2, tmp9, tmp10)
+ tmp12 = tl.broadcast_to(tmp11, [XBLOCK, R0_BLOCK])
+ tmp14 = _tmp13 + tmp12
+ _tmp13 = tl.where(r0_mask, tmp14, _tmp13)
+ tmp13 = tl.sum(_tmp13, 1)[:, None]
+ tl.store(out_ptr0 + (x3), tmp13, None)
+ ''', device_str='cuda')
+
+
+ triton_red_fused__to_copy_mul_sum_1 = async_compile.triton('triton_red_fused__to_copy_mul_sum_1', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.reduction(
+ size_hints={'x': 128, 'r0_': 512},
+ reduction_hint=ReductionHint.OUTER_TINY,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_mul_sum_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_red_fused__to_copy_mul_sum_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
+ xnumel = 128
+ r0_numel = 320
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+ xmask = xindex < xnumel
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
+ rbase = r0_base
+ x0 = xindex
+ _tmp2 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
+ r0_index = r0_offset + r0_base
+ r0_mask = r0_index < r0_numel
+ roffset = r0_offset
+ rindex = r0_index
+ r0_1 = r0_index
+ tmp0 = tl.load(in_ptr0 + (x0 + 128*r0_1), xmask & r0_mask, eviction_policy='evict_first', other=0.0)
+ tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
+ tmp3 = _tmp2 + tmp1
+ _tmp2 = tl.where(r0_mask & xmask, tmp3, _tmp2)
+ tmp2 = tl.sum(_tmp2, 1)[:, None]
+ tl.store(out_ptr0 + (x0), tmp2, xmask)
+ ''', device_str='cuda')
+
+
+ triton_per_fused__to_copy_add_div_mul_pow_sum_2 = async_compile.triton('triton_per_fused__to_copy_add_div_mul_pow_sum_2', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.persistent_reduction(
+ size_hints={'x': 131072, 'r0_': 128},
+ reduction_hint=ReductionHint.INNER,
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_div_mul_pow_sum_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_per_fused__to_copy_add_div_mul_pow_sum_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
+ r0_numel = 128
+ R0_BLOCK: tl.constexpr = 128
+ rnumel = r0_numel
+ RBLOCK: tl.constexpr = R0_BLOCK
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+ xmask = xindex < xnumel
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
+ r0_offset = 0
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+ roffset = r0_offset
+ rindex = r0_index
+ r0_1 = r0_index
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
+ tmp4 = tl.load(in_ptr2 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
+ tmp11 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 * tmp1
+ tmp3 = tmp2.to(tl.float32)
+ tmp5 = tmp4.to(tl.float32)
+ tmp6 = tmp3 * tmp5
+ tmp7 = tl.broadcast_to(tmp6, [XBLOCK, R0_BLOCK])
+ tmp9 = tl.where(xmask, tmp7, 0)
+ tmp10 = tl.sum(tmp9, 1)[:, None]
+ tmp12 = tmp3 * tmp11
+ tmp13 = -0.5
+ tmp14 = tmp10 * tmp13
+ tmp15 = tmp11 * tmp11
+ tmp16 = tmp15 * tmp11
+ tmp17 = tmp14 * tmp16
+ tmp18 = 0.0078125
+ tmp19 = tmp17 * tmp18
+ tmp20 = 2.0
+ tmp21 = tmp5 * tmp20
+ tmp22 = tmp19 * tmp21
+ tmp23 = tmp12 + tmp22
+ tmp24 = tmp23.to(tl.float32)
+ tl.store(out_ptr1 + (r0_1 + 128*x0), tmp24, xmask)
+ ''', device_str='cuda')
+
+ async_compile.wait(globals())
+ del async_compile
+
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ stream0 = get_raw_stream(0)
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ stream0 = get_raw_stream(0)
+ tangents_1 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ primals_4 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ rsqrt = generate_example_value((8, 1000, 16, 1), (16000, 16, 1, 1), 'cuda:0', torch.float32, 0, (8, 1000, 16, 1))
+ buf0 = generate_example_value((1, 1, 1, 128, 320), (40960, 40960, 40960, 1, 128), 'cuda:0', torch.float32, 0, (1, 1, 1, 128, 320))
+ triton_red_fused__to_copy_mul_sum_0.run(tangents_1, primals_4, rsqrt, buf0, 8, 1000, 16, 40960, 400, stream=stream0)
+ del tangents_1, primals_4, rsqrt, buf0
+
+ stream0 = get_raw_stream(0)
+ buf0 = generate_example_value((1, 1, 1, 128, 320), (40960, 40960, 40960, 1, 128), 'cuda:0', torch.float32, 0, (1, 1, 1, 128, 320))
+ buf1 = generate_example_value((1, 1, 1, 128), (128, 128, 128, 1), 'cuda:0', torch.bfloat16, 0, (1, 1, 1, 128))
+ triton_red_fused__to_copy_mul_sum_1.run(buf0, buf1, 128, 320, stream=stream0)
+ del buf0, buf1
+
+ stream0 = get_raw_stream(0)
+ tangents_1 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ primals_5 = generate_example_value((128,), (1,), 'cuda:0', torch.bfloat16, 0, (128,))
+ primals_4 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ rsqrt = generate_example_value((8, 1000, 16, 1), (16000, 16, 1, 1), 'cuda:0', torch.float32, 0, (8, 1000, 16, 1))
+ buf3 = generate_example_value((8, 1000, 16, 128), (2048000, 2048, 128, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 16, 128))
+ triton_per_fused__to_copy_add_div_mul_pow_sum_2.run(tangents_1, primals_5, primals_4, rsqrt, buf3, 128000, 128, stream=stream0)
+ del tangents_1, primals_5, primals_4, rsqrt, buf3
+
+ """
+ # AOT ID: ['9_backward']
+ from ctypes import c_void_p, c_long, c_int
+ import torch
+ import math
+ import random
+ import os
+ import tempfile
+ from math import inf, nan
+ from cmath import nanj
+ from torch._inductor.hooks import run_intermediate_hooks
+ from torch._inductor.utils import maybe_profile
+ from torch._inductor.codegen.memory_planning import _align as align
+ from torch import device, empty_strided
+ from torch._inductor.async_compile import AsyncCompile
224
+ from torch._inductor.select_algorithm import extern_kernels
225
+ from torch._inductor.codegen.multi_kernel import MultiKernelCall
226
+ import triton
227
+ import triton.language as tl
228
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
229
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
230
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
231
+
232
+ aten = torch.ops.aten
233
+ inductor_ops = torch.ops.inductor
234
+ _quantized = torch.ops._quantized
235
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
236
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
237
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
238
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
239
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
240
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
241
+ async_compile = AsyncCompile()
242
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
243
+
244
+
245
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/mm/cmmurv7ol6o3kll2wm4b6wgtdca4tsysrq7yrhhvfkf7ikm72y24.py
246
+ # Topologically Sorted Source Nodes: [hidden_states, hidden_states_1, to_1], Original ATen: [aten._to_copy, aten.mul, aten.sum]
247
+ # Source node to ATen node mapping:
248
+ # hidden_states => convert_element_type
249
+ # hidden_states_1 => mul_23
250
+ # to_1 => convert_element_type_1
251
+ # Graph fragment:
252
+ # %convert_element_type : [num_users=3] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%primals_4, torch.float32), kwargs = {})
253
+ # %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
254
+ # %convert_element_type_1 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_23, torch.bfloat16), kwargs = {})
255
+ # %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%tangents_1, %convert_element_type_1), kwargs = {})
256
+ # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_38, [0, 1, 2], True), kwargs = {})
257
+ triton_red_fused__to_copy_mul_sum_0 = async_compile.triton('triton_red_fused__to_copy_mul_sum_0', '''
258
+ import triton
259
+ import triton.language as tl
260
+
261
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
262
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
263
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
264
+ triton_helpers.set_driver_to_gpu()
265
+
266
+ @triton_heuristics.reduction(
267
+ size_hints={'x': 65536, 'r0_': 512},
268
+ reduction_hint=ReductionHint.OUTER,
269
+ filename=__file__,
270
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'out_ptr0': '*fp32', 'ks0': 'i32', 'ks1': 'i32', 'ks2': 'i32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
271
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_mul_sum_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
272
+ )
273
+ @triton.jit
274
+ def triton_red_fused__to_copy_mul_sum_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, ks0, ks1, ks2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
275
+ xnumel = 40960
276
+ rnumel = r0_numel
277
+ RBLOCK: tl.constexpr = R0_BLOCK
278
+ xoffset = tl.program_id(0) * XBLOCK
279
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
280
+ xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
281
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
282
+ rbase = r0_base
283
+ x1 = xindex // 128
284
+ x0 = (xindex % 128)
285
+ _tmp13 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
286
+ x3 = xindex
287
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
288
+ r0_index = r0_offset + r0_base
289
+ r0_mask = r0_index < r0_numel
290
+ roffset = r0_offset
291
+ rindex = r0_index
292
+ r0_2 = r0_index
293
+ tmp0 = r0_2 + x1*((319 + ks0*ks1*ks2) // 320)
294
+ tmp1 = ks0*ks1*ks2
295
+ tmp2 = tmp0 < tmp1
296
+ tmp3 = tl.load(in_ptr0 + (x0 + 128*(((r0_2 + x1*((319 + ks0*ks1*ks2) // 320)) % (ks0*ks1*ks2)))), r0_mask & tmp2, eviction_policy='evict_first', other=0.0).to(tl.float32)
297
+ tmp4 = tl.load(in_ptr1 + (x0 + 128*(((r0_2 + x1*((319 + ks0*ks1*ks2) // 320)) % (ks0*ks1*ks2)))), r0_mask & tmp2, eviction_policy='evict_first', other=0.0).to(tl.float32)
298
+ tmp5 = tmp4.to(tl.float32)
299
+ tmp6 = tl.load(in_ptr2 + (((r0_2 + x1*((319 + ks0*ks1*ks2) // 320)) % (ks0*ks1*ks2))), r0_mask & tmp2, eviction_policy='evict_last', other=0.0)
300
+ tmp7 = tmp5 * tmp6
301
+ tmp8 = tmp7.to(tl.float32)
302
+ tmp9 = tmp3 * tmp8
303
+ tmp10 = tl.full(tmp9.shape, 0, tmp9.dtype)
304
+ tmp11 = tl.where(tmp2, tmp9, tmp10)
305
+ tmp12 = tl.broadcast_to(tmp11, [XBLOCK, R0_BLOCK])
306
+ tmp14 = _tmp13 + tmp12
307
+ _tmp13 = tl.where(r0_mask, tmp14, _tmp13)
308
+ tmp13 = tl.sum(_tmp13, 1)[:, None]
309
+ tl.store(out_ptr0 + (x3), tmp13, None)
310
+ ''', device_str='cuda')
311
+
312
+
313
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/om/com66ihngf42hseqjadt4jcosvwffgk5ynrurmthhraywffjqcop.py
314
+ # Topologically Sorted Source Nodes: [hidden_states, hidden_states_1, to_1], Original ATen: [aten._to_copy, aten.mul, aten.sum]
315
+ # Source node to ATen node mapping:
316
+ # hidden_states => convert_element_type
317
+ # hidden_states_1 => mul_23
318
+ # to_1 => convert_element_type_1
319
+ # Graph fragment:
320
+ # %convert_element_type : [num_users=3] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%primals_4, torch.float32), kwargs = {})
321
+ # %mul_23 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type, %rsqrt), kwargs = {})
322
+ # %convert_element_type_1 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_23, torch.bfloat16), kwargs = {})
323
+ # %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%tangents_1, %convert_element_type_1), kwargs = {})
324
+ # %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_38, [0, 1, 2], True), kwargs = {})
325
+ triton_red_fused__to_copy_mul_sum_1 = async_compile.triton('triton_red_fused__to_copy_mul_sum_1', '''
326
+ import triton
327
+ import triton.language as tl
328
+
329
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
330
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
331
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
332
+ triton_helpers.set_driver_to_gpu()
333
+
334
+ @triton_heuristics.reduction(
335
+ size_hints={'x': 128, 'r0_': 512},
336
+ reduction_hint=ReductionHint.OUTER_TINY,
337
+ filename=__file__,
338
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr', 'R0_BLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
339
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_mul_sum_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
340
+ )
341
+ @triton.jit
342
+ def triton_red_fused__to_copy_mul_sum_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
343
+ xnumel = 128
344
+ r0_numel = 320
345
+ rnumel = r0_numel
346
+ RBLOCK: tl.constexpr = R0_BLOCK
347
+ xoffset = tl.program_id(0) * XBLOCK
348
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
349
+ xmask = xindex < xnumel
350
+ r0_base = tl.arange(0, R0_BLOCK)[None, :]
351
+ rbase = r0_base
352
+ x0 = xindex
353
+ _tmp2 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
354
+ for r0_offset in range(0, r0_numel, R0_BLOCK):
355
+ r0_index = r0_offset + r0_base
356
+ r0_mask = r0_index < r0_numel
357
+ roffset = r0_offset
358
+ rindex = r0_index
359
+ r0_1 = r0_index
360
+ tmp0 = tl.load(in_ptr0 + (x0 + 128*r0_1), xmask & r0_mask, eviction_policy='evict_first', other=0.0)
361
+ tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
362
+ tmp3 = _tmp2 + tmp1
363
+ _tmp2 = tl.where(r0_mask & xmask, tmp3, _tmp2)
364
+ tmp2 = tl.sum(_tmp2, 1)[:, None]
365
+ tl.store(out_ptr0 + (x0), tmp2, xmask)
366
+ ''', device_str='cuda')
367
+
368
+
369
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/p3/cp3uyorjp57oo5jc6wsaegyfcdcbyu6gppqliksrkascc6kis3o2.py
370
+ # Topologically Sorted Source Nodes: [hidden_states], Original ATen: [aten.mul, aten._to_copy, aten.sum, aten.div, aten.pow, aten.add]
371
+ # Source node to ATen node mapping:
372
+ # hidden_states => convert_element_type
373
+ # Graph fragment:
374
+ # %mul_37 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%tangents_1, %primals_5), kwargs = {})
375
+ # %convert_element_type : [num_users=3] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%primals_4, torch.float32), kwargs = {})
376
+ # %convert_element_type_2 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_37, torch.float32), kwargs = {})
377
+ # %mul_39 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %convert_element_type), kwargs = {})
378
+ # %mul_40 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %rsqrt), kwargs = {})
379
+ # %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%mul_39, [3], True), kwargs = {})
380
+ # %div : [num_users=1] = call_function[target=torch.ops.aten.div.Scalar](args = (%expand, 128), kwargs = {})
381
+ # %pow_3 : [num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type, 1.0), kwargs = {})
382
+ # %mul_43 : [num_users=1] = call_function[target=torch.ops.aten.mul.Scalar](args = (%pow_3, 2.0), kwargs = {})
383
+ # %mul_44 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%div, %mul_43), kwargs = {})
384
+ # %add_46 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_40, %mul_44), kwargs = {})
385
+ # %convert_element_type_3 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_46, torch.bfloat16), kwargs = {})
386
+ triton_per_fused__to_copy_add_div_mul_pow_sum_2 = async_compile.triton('triton_per_fused__to_copy_add_div_mul_pow_sum_2', '''
387
+ import triton
388
+ import triton.language as tl
389
+
390
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
391
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
392
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
393
+ triton_helpers.set_driver_to_gpu()
394
+
395
+ @triton_heuristics.persistent_reduction(
396
+ size_hints={'x': 131072, 'r0_': 128},
397
+ reduction_hint=ReductionHint.INNER,
398
+ filename=__file__,
399
+ triton_meta={'signature': {'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
400
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_div_mul_pow_sum_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 4, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
401
+ )
402
+ @triton.jit
403
+ def triton_per_fused__to_copy_add_div_mul_pow_sum_2(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr):
404
+ r0_numel = 128
405
+ R0_BLOCK: tl.constexpr = 128
406
+ rnumel = r0_numel
407
+ RBLOCK: tl.constexpr = R0_BLOCK
408
+ xoffset = tl.program_id(0) * XBLOCK
409
+ xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
410
+ xmask = xindex < xnumel
411
+ r0_index = tl.arange(0, R0_BLOCK)[None, :]
412
+ r0_offset = 0
413
+ r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
414
+ roffset = r0_offset
415
+ rindex = r0_index
416
+ r0_1 = r0_index
417
+ x0 = xindex
418
+ tmp0 = tl.load(in_ptr0 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
419
+ tmp1 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
420
+ tmp4 = tl.load(in_ptr2 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
421
+ tmp11 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
422
+ tmp2 = tmp0 * tmp1
423
+ tmp3 = tmp2.to(tl.float32)
424
+ tmp5 = tmp4.to(tl.float32)
425
+ tmp6 = tmp3 * tmp5
426
+ tmp7 = tl.broadcast_to(tmp6, [XBLOCK, R0_BLOCK])
427
+ tmp9 = tl.where(xmask, tmp7, 0)
428
+ tmp10 = tl.sum(tmp9, 1)[:, None]
429
+ tmp12 = tmp3 * tmp11
430
+ tmp13 = -0.5
431
+ tmp14 = tmp10 * tmp13
432
+ tmp15 = tmp11 * tmp11
433
+ tmp16 = tmp15 * tmp11
434
+ tmp17 = tmp14 * tmp16
435
+ tmp18 = 0.0078125
436
+ tmp19 = tmp17 * tmp18
437
+ tmp20 = 2.0
438
+ tmp21 = tmp5 * tmp20
439
+ tmp22 = tmp19 * tmp21
440
+ tmp23 = tmp12 + tmp22
441
+ tmp24 = tmp23.to(tl.float32)
442
+ tl.store(out_ptr1 + (r0_1 + 128*x0), tmp24, xmask)
443
+ ''', device_str='cuda')
444
+
445
+
446
+ async_compile.wait(globals())
447
+ del async_compile
448
+
449
+ def call(args):
450
+ primals_1, primals_2, primals_3, primals_4, primals_5, rsqrt, tangents_1 = args
451
+ args.clear()
452
+ s3 = primals_1
453
+ s4 = primals_2
454
+ s5 = primals_3
455
+ assert_size_stride(primals_4, (s3, s4, s5, 128), (128*s4*s5, 128*s5, 128, 1))
456
+ assert_size_stride(primals_5, (128, ), (1, ))
457
+ assert_size_stride(rsqrt, (s3, s4, s5, 1), (s4*s5, s5, 1, 1))
458
+ assert_size_stride(tangents_1, (s3, s4, s5, 128), (128*s4*s5, 128*s5, 128, 1))
459
+ with torch.cuda._DeviceGuard(0):
460
+ torch.cuda.set_device(0)
461
+ buf0 = empty_strided_cuda((1, 1, 1, 128, 320), (40960, 40960, 40960, 1, 128), torch.float32)
462
+ # Topologically Sorted Source Nodes: [hidden_states, hidden_states_1, to_1], Original ATen: [aten._to_copy, aten.mul, aten.sum]
463
+ triton_red_fused__to_copy_mul_sum_0_r0_numel = (319 + s3*s4*s5) // 320
464
+ stream0 = get_raw_stream(0)
465
+ triton_red_fused__to_copy_mul_sum_0.run(tangents_1, primals_4, rsqrt, buf0, s3, s4, s5, 40960, triton_red_fused__to_copy_mul_sum_0_r0_numel, stream=stream0)
466
+ buf1 = empty_strided_cuda((1, 1, 1, 128), (128, 128, 128, 1), torch.bfloat16)
467
+ # Topologically Sorted Source Nodes: [hidden_states, hidden_states_1, to_1], Original ATen: [aten._to_copy, aten.mul, aten.sum]
468
+ stream0 = get_raw_stream(0)
469
+ triton_red_fused__to_copy_mul_sum_1.run(buf0, buf1, 128, 320, stream=stream0)
470
+ del buf0
471
+ buf3 = empty_strided_cuda((s3, s4, s5, 128), (128*s4*s5, 128*s5, 128, 1), torch.bfloat16)
472
+ # Topologically Sorted Source Nodes: [hidden_states], Original ATen: [aten.mul, aten._to_copy, aten.sum, aten.div, aten.pow, aten.add]
473
+ triton_per_fused__to_copy_add_div_mul_pow_sum_2_xnumel = s3*s4*s5
474
+ stream0 = get_raw_stream(0)
475
+ triton_per_fused__to_copy_add_div_mul_pow_sum_2.run(tangents_1, primals_5, primals_4, rsqrt, buf3, triton_per_fused__to_copy_add_div_mul_pow_sum_2_xnumel, 128, stream=stream0)
476
+ del primals_4
477
+ del primals_5
478
+ del rsqrt
479
+ del tangents_1
480
+ return (None, None, None, buf3, reinterpret_tensor(buf1, (128, ), (1, ), 0), )
481
+
482
+
483
+ def benchmark_compiled_module(times=10, repeat=10):
484
+ from torch._dynamo.testing import rand_strided
485
+ from torch._inductor.utils import print_performance
486
+ primals_1 = 8
487
+ primals_2 = 1000
488
+ primals_3 = 16
489
+ primals_4 = rand_strided((8, 1000, 16, 128), (2048000, 2048, 128, 1), device='cuda:0', dtype=torch.bfloat16)
490
+ primals_5 = rand_strided((128, ), (1, ), device='cuda:0', dtype=torch.bfloat16)
491
+ rsqrt = rand_strided((8, 1000, 16, 1), (16000, 16, 1, 1), device='cuda:0', dtype=torch.float32)
492
+ tangents_1 = rand_strided((8, 1000, 16, 128), (2048000, 2048, 128, 1), device='cuda:0', dtype=torch.bfloat16)
493
+ fn = lambda: call([primals_1, primals_2, primals_3, primals_4, primals_5, rsqrt, tangents_1])
494
+ return print_performance(fn, times=times, repeat=repeat)
495
+
496
+
497
+ if __name__ == "__main__":
498
+ from torch._inductor.wrapper_benchmark import compiled_module_main
499
+ compiled_module_main('None', benchmark_compiled_module)
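
Reader's note on the module above: the fused kernel triton_per_fused__to_copy_add_div_mul_pow_sum_2 is the input-gradient half of an RMSNorm backward over a hidden size of 128 (hence the constants 0.0078125 = 1/128, the -0.5 from differentiating rsqrt, and the rsqrt**3 term), while kernels 0 and 1 accumulate the weight gradient as a two-stage split reduction over 320 chunks. A minimal eager-mode sketch of the same math follows; the names (grad_out, x, weight) are illustrative and do not appear in the generated code:

import torch

def rmsnorm_backward_sketch(grad_out, x, weight, rsqrt, hidden=128):
    # grad_out, x: (..., hidden) bf16; rsqrt: (..., 1) fp32 saved from the forward
    g = (grad_out * weight).float()              # %mul_37 / %convert_element_type_2
    x32 = x.float()                              # %convert_element_type
    sum_gx = (g * x32).sum(-1, keepdim=True)     # %sum_2 over the last dim
    # chain rule through rsqrt(mean(x^2)): d rsqrt = -0.5 * rsqrt^3 * d mean
    dx = g * rsqrt + (-0.5 * sum_gx * rsqrt**3) * (2.0 * x32 / hidden)
    return dx.to(x.dtype)                        # %convert_element_type_3

def rmsnorm_weight_grad_sketch(grad_out, x, rsqrt):
    # roughly what kernels 0 and 1 compute (%mul_38, %sum_1); the generated
    # code accumulates in fp32 across two reduction stages before the bf16 cast
    return (grad_out.float() * (x.float() * rsqrt)).sum(dim=(0, 1, 2)).to(grad_out.dtype)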
torchinductor_ch-epfl-345354-j/6j/c6jqjdux4scc3alxlsrcpnhemegj7ym5pw3twg6xb2eyx4codkvz.py ADDED
@@ -0,0 +1,40 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 33554432},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0), xmask).to(tl.float32)
+ tmp7 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp2 = tmp1.to(tl.float32)
+ tmp3 = tl.sigmoid(tmp2)
+ tmp4 = tmp2 * tmp3
+ tmp5 = tmp4.to(tl.float32)
+ tmp6 = tmp0 * tmp5
+ tmp8 = tmp0 * tmp7
+ tmp9 = tl.sigmoid(tmp1)
+ tmp10 = 1.0
+ tmp11 = tmp10 - tmp9
+ tmp12 = tmp1 * tmp11
+ tmp13 = tmp12 + tmp10
+ tmp14 = tmp9 * tmp13
+ tmp15 = tmp8 * tmp14
+ tl.store(out_ptr0 + (x0), tmp6, xmask)
+ tl.store(in_out_ptr0 + (x0), tmp15, xmask)
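
Reader's note: the pointwise kernel above is the fused backward of a SiLU-gated product (a SwiGLU gate): out_ptr0 receives grad * silu(gate), the gradient flowing to the up-projection branch, and in_out_ptr0 is overwritten with grad * up * sigmoid(gate) * (1 + gate * (1 - sigmoid(gate))), i.e. grad times silu'(gate). A rough eager equivalent, assuming illustrative names (grad, gate, up) not taken from the kernel:

import torch

def swiglu_backward_sketch(grad, gate, up):
    # grad: upstream gradient; gate, up: the two intermediate projections (bf16)
    s = torch.sigmoid(gate.float())
    grad_up = grad * (gate.float() * s).to(grad.dtype)      # tmp6  -> out_ptr0
    dsilu = s * (1.0 + gate.float() * (1.0 - s))            # silu'(gate)
    grad_gate = (grad * up) * dsilu.to(grad.dtype)          # tmp15 -> in_out_ptr0
    return grad_gate, grad_up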
torchinductor_ch-epfl-345354-j/7a/c7a4b5izank2343xz4473c4igojrrhlfxb5ulctqd32qrtkreq3m.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
6
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
7
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
8
+ triton_helpers.set_driver_to_gpu()
9
+
10
+ @triton_heuristics.pointwise(
11
+ size_hints={'x': 268435456},
12
+ filename=__file__,
13
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*i1', 'in_ptr2': '*fp32', 'in_ptr3': '*bf16', 'in_ptr4': '*fp32', 'in_ptr5': '*fp32', 'in_ptr6': '*fp32', 'out_ptr0': '*bf16', 'ks0': 'i32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]]}]},
14
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_7', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 7, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
15
+ min_elem_per_thread=0
16
+ )
17
+ @triton.jit
18
+ def triton_poi_fused__log_softmax__log_softmax_backward_data__to_copy_nll_loss_backward_nll_loss_forward_7(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
19
+ xoffset = tl.program_id(0) * XBLOCK
20
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
21
+ xmask = xindex < xnumel
22
+ x2 = xindex
23
+ x1 = xindex // ks0
24
+ tmp0 = tl.load(in_ptr0 + (x2), xmask, eviction_policy='evict_last')
25
+ tmp1 = tl.load(in_ptr1 + (x1), xmask, eviction_policy='evict_last').to(tl.int1)
26
+ tmp2 = tl.load(in_ptr2 + (0))
27
+ tmp3 = tl.broadcast_to(tmp2, [XBLOCK])
28
+ tmp7 = tl.load(in_ptr3 + (x2), xmask, eviction_policy='evict_last').to(tl.float32)
29
+ tmp9 = tl.load(in_ptr4 + (x1), xmask, eviction_policy='evict_last')
30
+ tmp11 = tl.load(in_ptr5 + (x1), xmask, eviction_policy='evict_last')
31
+ tmp14 = tl.load(in_ptr6 + (x1), xmask, eviction_policy='evict_last')
32
+ tmp4 = 0.0
33
+ tmp5 = tl.where(tmp1, tmp3, tmp4)
34
+ tmp6 = tmp0 * tmp5
35
+ tmp8 = tmp7.to(tl.float32)
36
+ tmp10 = tmp8 - tmp9
37
+ tmp12 = tmp10 - tmp11
38
+ tmp13 = tl_math.exp(tmp12)
39
+ tmp15 = tmp13 * tmp14
40
+ tmp16 = tmp6 - tmp15
41
+ tmp17 = tmp16.to(tl.float32)
42
+ tl.store(out_ptr0 + (x2), tmp17, xmask)
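
Reader's note: this kernel is the fused log_softmax + nll_loss backward. Per row it reconstructs softmax = exp(logits - row_max - logsumexp) and emits (target-scatter gradient) - softmax * (row gradient sum), with the target term zeroed where the label mask (in_ptr1) is false. A hedged per-row sketch with illustrative names (the pointer-to-variable mapping is an assumption read off the loads above):

import torch

def ce_backward_row_sketch(logits, grad_target, valid_row, grad_loss,
                           row_max, logsumexp, row_grad_sum):
    # logits: (rows, classes) bf16; row_max/logsumexp/row_grad_sum: (rows, 1) fp32
    # grad_target: scatter of the loss gradient at the label positions (in_ptr0)
    softmax = torch.exp(logits.float() - row_max - logsumexp)
    scale = torch.where(valid_row, grad_loss, torch.zeros_like(grad_loss))
    return (grad_target * scale - softmax * row_grad_sum).to(logits.dtype)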
torchinductor_ch-epfl-345354-j/7a/ddaeab32a9175f6d14ae7329f3defe09537605c41fa4d35da5bf9cbac1616b91.best_config ADDED
@@ -0,0 +1 @@
+ {"XBLOCK": 512, "num_warps": 8, "num_stages": 1, "configs_hash": "3ca5c3e34d35093f3c9ab2829a9faeebad5e61c4ca13d5ed6053d7b71ce60d5a", "found_by_coordesc": false, "time_taken_ms": 85}
torchinductor_ch-epfl-345354-j/7q/c7qudnwq7tyfwnepjsm2ilmratxdwkx4euvow7brbvrfif7hgnwh.py ADDED
@@ -0,0 +1,324 @@
+ """
+ Compile-time auto-tuning block:
+
+ import torch
+ from torch._dynamo.testing import rand_strided
+ from torch._dynamo.utils import preserve_rng_state
+ from torch._inductor.select_algorithm import AlgorithmSelectorCache
+ from torch._inductor.async_compile import AsyncCompile
+
+ async_compile = AsyncCompile()
+ generate_example_value = AlgorithmSelectorCache.generate_example_value
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+
+
+ triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0 = async_compile.triton('triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 33554432},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0), xmask).to(tl.float32)
+ tmp7 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp2 = tmp1.to(tl.float32)
+ tmp3 = tl.sigmoid(tmp2)
+ tmp4 = tmp2 * tmp3
+ tmp5 = tmp4.to(tl.float32)
+ tmp6 = tmp0 * tmp5
+ tmp8 = tmp0 * tmp7
+ tmp9 = tl.sigmoid(tmp1)
+ tmp10 = 1.0
+ tmp11 = tmp10 - tmp9
+ tmp12 = tmp1 * tmp11
+ tmp13 = tmp12 + tmp10
+ tmp14 = tmp9 * tmp13
+ tmp15 = tmp8 * tmp14
+ tl.store(out_ptr0 + (x0), tmp6, xmask)
+ tl.store(in_out_ptr0 + (x0), tmp15, xmask)
+ ''', device_str='cuda')
+
+
+ triton_poi_fused_add_1 = async_compile.triton('triton_poi_fused_add_1', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8388608},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_1', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_1(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp2 = tmp0 + tmp1
+ tl.store(in_out_ptr0 + (x0), tmp2, xmask)
+ ''', device_str='cuda')
+
+ async_compile.wait(globals())
+ del async_compile
+
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ stream0 = get_raw_stream(0)
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ stream0 = get_raw_stream(0)
+ buf5 = generate_example_value((8, 1000, 3072), (3072000, 3072, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 3072))
+ buf1 = generate_example_value((8000, 3072), (3072, 1), 'cuda:0', torch.bfloat16, 0, (8000, 3072))
+ mm = generate_example_value((8000, 3072), (3072, 1), 'cuda:0', torch.bfloat16, 0, (8000, 3072))
+ buf2 = generate_example_value((8, 1000, 3072), (3072000, 3072, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 3072))
+ triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0.run(buf5, buf1, mm, buf2, 24576000, stream=stream0)
+ del buf5, buf1, mm, buf2
+
+ stream0 = get_raw_stream(0)
+ buf8 = generate_example_value((8, 1000, 1024), (1024000, 1024, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 1024))
+ buf7 = generate_example_value((8000, 1024), (1024, 1), 'cuda:0', torch.bfloat16, 0, (8000, 1024))
+ triton_poi_fused_add_1.run(buf8, buf7, 8192000, stream=stream0)
+ del buf8, buf7
+
+ """
+ # AOT ID: ['11_backward']
+ from ctypes import c_void_p, c_long, c_int
+ import torch
+ import math
+ import random
+ import os
+ import tempfile
+ from math import inf, nan
+ from cmath import nanj
+ from torch._inductor.hooks import run_intermediate_hooks
+ from torch._inductor.utils import maybe_profile
+ from torch._inductor.codegen.memory_planning import _align as align
+ from torch import device, empty_strided
+ from torch._inductor.async_compile import AsyncCompile
+ from torch._inductor.select_algorithm import extern_kernels
+ from torch._inductor.codegen.multi_kernel import MultiKernelCall
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+ aten = torch.ops.aten
+ inductor_ops = torch.ops.inductor
+ _quantized = torch.ops._quantized
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
+ async_compile = AsyncCompile()
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/6j/c6jqjdux4scc3alxlsrcpnhemegj7ym5pw3twg6xb2eyx4codkvz.py
+ # Topologically Sorted Source Nodes: [silu], Original ATen: [aten.silu, aten.mul, aten.sigmoid, aten.fill, aten.sub, aten.add]
+ # Source node to ATen node mapping:
+ # silu => convert_element_type_2, convert_element_type_3, mul_19, sigmoid
+ # Graph fragment:
+ # %convert_element_type_2 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {})
+ # %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_2,), kwargs = {})
+ # %mul_19 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %sigmoid), kwargs = {})
+ # %convert_element_type_3 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_19, torch.bfloat16), kwargs = {})
+ # %mul_61 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_7, %convert_element_type_3), kwargs = {})
+ # %mul_62 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_7, %view_3), kwargs = {})
+ # %sigmoid_1 : [num_users=2] = call_function[target=torch.ops.aten.sigmoid.default](args = (%view_1,), kwargs = {})
+ # %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([%primals_2, %primals_3, 3072], 1), kwargs = {dtype: torch.bfloat16, layout: torch.strided, device: cuda:0, pin_memory: False})
+ # %sub_16 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%full_default, %sigmoid_1), kwargs = {})
+ # %mul_63 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_1, %sub_16), kwargs = {})
+ # %add_38 : [num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mul_63, 1), kwargs = {})
+ # %mul_64 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sigmoid_1, %add_38), kwargs = {})
+ # %mul_65 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_62, %mul_64), kwargs = {})
+ triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0 = async_compile.triton('triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 33554432},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 3, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tl.load(in_ptr1 + (x0), xmask).to(tl.float32)
+ tmp7 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp2 = tmp1.to(tl.float32)
+ tmp3 = tl.sigmoid(tmp2)
+ tmp4 = tmp2 * tmp3
+ tmp5 = tmp4.to(tl.float32)
+ tmp6 = tmp0 * tmp5
+ tmp8 = tmp0 * tmp7
+ tmp9 = tl.sigmoid(tmp1)
+ tmp10 = 1.0
+ tmp11 = tmp10 - tmp9
+ tmp12 = tmp1 * tmp11
+ tmp13 = tmp12 + tmp10
+ tmp14 = tmp9 * tmp13
+ tmp15 = tmp8 * tmp14
+ tl.store(out_ptr0 + (x0), tmp6, xmask)
+ tl.store(in_out_ptr0 + (x0), tmp15, xmask)
+ ''', device_str='cuda')
+
+
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/cm/ccm7s7qxxaw3nofvzpftd6fqmd57jvnzfc74xrk54wxxhdnqddlo.py
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.add]
+ # Source node to ATen node mapping:
+ # Graph fragment:
+ # %add_39 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_9, %view_11), kwargs = {})
+ triton_poi_fused_add_1 = async_compile.triton('triton_poi_fused_add_1', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8388608},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_add_1', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_add_1(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp2 = tmp0 + tmp1
+ tl.store(in_out_ptr0 + (x0), tmp2, xmask)
+ ''', device_str='cuda')
+
+
+ async_compile.wait(globals())
+ del async_compile
+
+ def call(args):
+ primals_2, primals_3, mul, view, mm, mm_1, view_4, permute_5, permute_9, permute_14, tangents_1 = args
+ args.clear()
+ s0 = primals_2
+ s1 = primals_3
+ assert_size_stride(view, (s0*s1, 1024), (1024, 1))
+ assert_size_stride(mm, (s0*s1, 3072), (3072, 1))
+ assert_size_stride(mm_1, (s0*s1, 3072), (3072, 1))
+ assert_size_stride(view_4, (s0*s1, 3072), (3072, 1))
+ assert_size_stride(permute_5, (1024, 3072), (3072, 1))
+ assert_size_stride(permute_9, (3072, 1024), (1024, 1))
+ assert_size_stride(permute_14, (3072, 1024), (1024, 1))
+ assert_size_stride(tangents_1, (s0, s1, 1024), (1024*s1, 1024, 1))
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ buf0 = empty_strided_cuda((1024, 3072), (3072, 1), torch.bfloat16)
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
+ extern_kernels.mm(reinterpret_tensor(tangents_1, (1024, s0*s1), (1, 1024), 0), view_4, out=buf0)
+ del view_4
+ buf1 = empty_strided_cuda((s0*s1, 3072), (3072, 1), torch.bfloat16)
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
+ extern_kernels.mm(reinterpret_tensor(tangents_1, (s0*s1, 1024), (1024, 1), 0), permute_5, out=buf1)
+ del permute_5
+ del tangents_1
+ buf2 = empty_strided_cuda((s0, s1, 3072), (3072*s1, 3072, 1), torch.bfloat16)
+ buf5 = reinterpret_tensor(mm_1, (s0, s1, 3072), (3072*s1, 3072, 1), 0); del mm_1 # reuse
+ # Topologically Sorted Source Nodes: [silu], Original ATen: [aten.silu, aten.mul, aten.sigmoid, aten.fill, aten.sub, aten.add]
+ triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0_xnumel = 3072*s0*s1
+ stream0 = get_raw_stream(0)
+ triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0.run(buf5, buf1, mm, buf2, triton_poi_fused_add_fill_mul_sigmoid_silu_sub_0_xnumel, stream=stream0)
+ del buf1
+ del mm
+ buf3 = empty_strided_cuda((3072, 1024), (1024, 1), torch.bfloat16)
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
+ extern_kernels.mm(reinterpret_tensor(buf2, (3072, s0*s1), (1, 3072), 0), view, out=buf3)
+ buf4 = empty_strided_cuda((s0*s1, 1024), (1024, 1), torch.bfloat16)
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
+ extern_kernels.mm(reinterpret_tensor(buf2, (s0*s1, 3072), (3072, 1), 0), permute_9, out=buf4)
+ del buf2
+ del permute_9
+ buf6 = empty_strided_cuda((3072, 1024), (1024, 1), torch.bfloat16)
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
+ extern_kernels.mm(reinterpret_tensor(buf5, (3072, s0*s1), (1, 3072), 0), view, out=buf6)
+ del view
+ buf7 = empty_strided_cuda((s0*s1, 1024), (1024, 1), torch.bfloat16)
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.mm]
+ extern_kernels.mm(reinterpret_tensor(buf5, (s0*s1, 3072), (3072, 1), 0), permute_14, out=buf7)
+ del buf5
+ del permute_14
+ buf8 = reinterpret_tensor(buf4, (s0, s1, 1024), (1024*s1, 1024, 1), 0); del buf4 # reuse
+ # Topologically Sorted Source Nodes: [], Original ATen: [aten.add]
+ triton_poi_fused_add_1_xnumel = 1024*s0*s1
+ stream0 = get_raw_stream(0)
+ triton_poi_fused_add_1.run(buf8, buf7, triton_poi_fused_add_1_xnumel, stream=stream0)
+ del buf7
+ return (buf6, None, None, buf8, buf3, buf0, )
+
+
+ def benchmark_compiled_module(times=10, repeat=10):
+ from torch._dynamo.testing import rand_strided
+ from torch._inductor.utils import print_performance
+ primals_2 = 8
+ primals_3 = 1000
+ mul = 8000
+ view = rand_strided((8000, 1024), (1024, 1), device='cuda:0', dtype=torch.bfloat16)
+ mm = rand_strided((8000, 3072), (3072, 1), device='cuda:0', dtype=torch.bfloat16)
+ mm_1 = rand_strided((8000, 3072), (3072, 1), device='cuda:0', dtype=torch.bfloat16)
+ view_4 = rand_strided((8000, 3072), (3072, 1), device='cuda:0', dtype=torch.bfloat16)
+ permute_5 = rand_strided((1024, 3072), (3072, 1), device='cuda:0', dtype=torch.bfloat16)
+ permute_9 = rand_strided((3072, 1024), (1024, 1), device='cuda:0', dtype=torch.bfloat16)
+ permute_14 = rand_strided((3072, 1024), (1024, 1), device='cuda:0', dtype=torch.bfloat16)
+ tangents_1 = rand_strided((8, 1000, 1024), (1024000, 1024, 1), device='cuda:0', dtype=torch.bfloat16)
+ fn = lambda: call([primals_2, primals_3, mul, view, mm, mm_1, view_4, permute_5, permute_9, permute_14, tangents_1])
+ return print_performance(fn, times=times, repeat=repeat)
+
+
+ if __name__ == "__main__":
+ from torch._inductor.wrapper_benchmark import compiled_module_main
+ compiled_module_main('None', benchmark_compiled_module)
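
Reader's note: the '11_backward' module above differentiates a SwiGLU MLP with hidden size 1024 and intermediate size 3072: buf0, buf3, and buf6 are the three weight gradients, the fused kernel splits the upstream gradient between the gate and up branches, and triton_poi_fused_add_1 sums the two matmul contributions to the input gradient. A minimal sketch of the forward pass it corresponds to, assuming illustrative weight names laid out (out_features, in_features) like nn.Linear:

import torch
import torch.nn.functional as F

def swiglu_mlp_forward_sketch(x, w_gate, w_up, w_down):
    # x: (batch, seq, 1024); w_gate, w_up: (3072, 1024); w_down: (1024, 3072)
    gate = x @ w_gate.t()                    # 'mm'   : (batch, seq, 3072)
    up = x @ w_up.t()                        # 'mm_1' : (batch, seq, 3072)
    return (F.silu(gate) * up) @ w_down.t()  # back to (batch, seq, 1024)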
torchinductor_ch-epfl-345354-j/a6/ca64rxymdowafnowfq53ckfynl3yei5mmfkeefu6f6xndlg3ukok.py ADDED
@@ -0,0 +1,200 @@
+ """
+ Compile-time auto-tuning block:
+
+ import torch
+ from torch._dynamo.testing import rand_strided
+ from torch._dynamo.utils import preserve_rng_state
+ from torch._inductor.select_algorithm import AlgorithmSelectorCache
+ from torch._inductor.async_compile import AsyncCompile
+
+ async_compile = AsyncCompile()
+ generate_example_value = AlgorithmSelectorCache.generate_example_value
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+
+
+ triton_poi_fused_mul_silu_0 = async_compile.triton('triton_poi_fused_mul_silu_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 33554432},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_mul_silu_0(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp5 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tmp0.to(tl.float32)
+ tmp2 = tl.sigmoid(tmp1)
+ tmp3 = tmp1 * tmp2
+ tmp4 = tmp3.to(tl.float32)
+ tmp6 = tmp4 * tmp5
+ tl.store(in_out_ptr0 + (x0), tmp6, xmask)
+ ''', device_str='cuda')
+
+ async_compile.wait(globals())
+ del async_compile
+
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ stream0 = get_raw_stream(0)
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ stream0 = get_raw_stream(0)
+ buf2 = generate_example_value((8, 1000, 3072), (3072000, 3072, 1), 'cuda:0', torch.bfloat16, 0, (8, 1000, 3072))
+ buf1 = generate_example_value((8000, 3072), (3072, 1), 'cuda:0', torch.bfloat16, 0, (8000, 3072))
+ triton_poi_fused_mul_silu_0.run(buf2, buf1, 24576000, stream=stream0)
+ del buf2, buf1
+
+ """
+ # AOT ID: ['5_inference']
+ from ctypes import c_void_p, c_long, c_int
+ import torch
+ import math
+ import random
+ import os
+ import tempfile
+ from math import inf, nan
+ from cmath import nanj
+ from torch._inductor.hooks import run_intermediate_hooks
+ from torch._inductor.utils import maybe_profile
+ from torch._inductor.codegen.memory_planning import _align as align
+ from torch import device, empty_strided
+ from torch._inductor.async_compile import AsyncCompile
+ from torch._inductor.select_algorithm import extern_kernels
+ from torch._inductor.codegen.multi_kernel import MultiKernelCall
+ import triton
+ import triton.language as tl
+ from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+ from torch._C import _cuda_getCurrentRawStream as get_raw_stream
+
+ aten = torch.ops.aten
+ inductor_ops = torch.ops.inductor
+ _quantized = torch.ops._quantized
+ assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+ empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
+ empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+ empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
+ reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
+ alloc_from_pool = torch.ops.inductor._alloc_from_pool
+ async_compile = AsyncCompile()
+ empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
+
+
+ # kernel path: /tmp/torchinductor_ch-epfl-345354-j/57/c574kngiopy3pgespyoupnzlae4d5tokyeui7uglwglnym2qijvn.py
+ # Topologically Sorted Source Nodes: [silu, mul], Original ATen: [aten.silu, aten.mul]
+ # Source node to ATen node mapping:
+ # mul => mul_38
+ # silu => convert_element_type_2, convert_element_type_3, mul_19, sigmoid
+ # Graph fragment:
+ # %convert_element_type_2 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view_1, torch.float32), kwargs = {})
+ # %sigmoid : [num_users=1] = call_function[target=torch.ops.aten.sigmoid.default](args = (%convert_element_type_2,), kwargs = {})
+ # %mul_19 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_2, %sigmoid), kwargs = {})
+ # %convert_element_type_3 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_19, torch.bfloat16), kwargs = {})
+ # %mul_38 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_3, %view_3), kwargs = {})
+ triton_poi_fused_mul_silu_0 = async_compile.triton('triton_poi_fused_mul_silu_0', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 33554432},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*bf16', 'in_ptr0': '*bf16', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_mul_silu_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_mul_silu_0(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_out_ptr0 + (x0), xmask).to(tl.float32)
+ tmp5 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
+ tmp1 = tmp0.to(tl.float32)
+ tmp2 = tl.sigmoid(tmp1)
+ tmp3 = tmp1 * tmp2
+ tmp4 = tmp3.to(tl.float32)
+ tmp6 = tmp4 * tmp5
+ tl.store(in_out_ptr0 + (x0), tmp6, xmask)
+ ''', device_str='cuda')
+
+
+ async_compile.wait(globals())
+ del async_compile
+
+ def call(args):
+ arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1 = args
+ args.clear()
+ s0 = arg1_1
+ s1 = arg2_1
+ assert_size_stride(arg0_1, (3072, 1024), (1024, 1))
+ assert_size_stride(arg3_1, (s0, s1, 1024), (1024*s1, 1024, 1))
+ assert_size_stride(arg4_1, (3072, 1024), (1024, 1))
+ assert_size_stride(arg5_1, (1024, 3072), (3072, 1))
+ with torch.cuda._DeviceGuard(0):
+ torch.cuda.set_device(0)
+ pool1 = empty_strided_cuda((s0*s1, 3072), (3072, 1), torch.bfloat16)
+ buf0 = pool1 # alloc
+ # Topologically Sorted Source Nodes: [linear], Original ATen: [aten.mm]
162
+ extern_kernels.mm(reinterpret_tensor(arg3_1, (s0*s1, 1024), (1024, 1), 0), reinterpret_tensor(arg0_1, (1024, 3072), (1, 1024), 0), out=buf0)
163
+ del arg0_1
164
+ pool2 = empty_strided_cuda((s0*s1, 3072), (3072, 1), torch.bfloat16)
165
+ buf1 = pool2 # alloc
166
+ # Topologically Sorted Source Nodes: [linear_1], Original ATen: [aten.mm]
167
+ extern_kernels.mm(reinterpret_tensor(arg3_1, (s0*s1, 1024), (1024, 1), 0), reinterpret_tensor(arg4_1, (1024, 3072), (1, 1024), 0), out=buf1)
168
+ del arg3_1
169
+ del arg4_1
170
+ buf2 = reinterpret_tensor(buf0, (s0, s1, 3072), (3072*s1, 3072, 1), 0); # reuse
171
+ # Topologically Sorted Source Nodes: [silu, mul], Original ATen: [aten.silu, aten.mul]
172
+ triton_poi_fused_mul_silu_0_xnumel = 3072*s0*s1
173
+ stream0 = get_raw_stream(0)
174
+ triton_poi_fused_mul_silu_0.run(buf2, buf1, triton_poi_fused_mul_silu_0_xnumel, stream=stream0)
175
+ del pool2, buf1
176
+ pool0 = empty_strided_cuda((s0*s1, 1024), (1024, 1), torch.bfloat16)
177
+ buf3 = pool0 # alloc
178
+ # Topologically Sorted Source Nodes: [down_proj], Original ATen: [aten.mm]
179
+ extern_kernels.mm(reinterpret_tensor(buf2, (s0*s1, 3072), (3072, 1), 0), reinterpret_tensor(arg5_1, (3072, 1024), (1, 3072), 0), out=buf3)
180
+ del arg5_1
181
+ del pool1, buf0, buf2
182
+ return (reinterpret_tensor(buf3, (s0, s1, 1024), (1024*s1, 1024, 1), 0), )
183
+
184
+
185
+ def benchmark_compiled_module(times=10, repeat=10):
186
+ from torch._dynamo.testing import rand_strided
187
+ from torch._inductor.utils import print_performance
188
+ arg0_1 = rand_strided((3072, 1024), (1024, 1), device='cuda:0', dtype=torch.bfloat16)
189
+ arg1_1 = 8
190
+ arg2_1 = 1000
191
+ arg3_1 = rand_strided((8, 1000, 1024), (1024000, 1024, 1), device='cuda:0', dtype=torch.bfloat16)
192
+ arg4_1 = rand_strided((3072, 1024), (1024, 1), device='cuda:0', dtype=torch.bfloat16)
193
+ arg5_1 = rand_strided((1024, 3072), (3072, 1), device='cuda:0', dtype=torch.bfloat16)
194
+ fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1])
195
+ return print_performance(fn, times=times, repeat=repeat)
196
+
197
+
198
+ if __name__ == "__main__":
199
+ from torch._inductor.wrapper_benchmark import compiled_module_main
200
+ compiled_module_main('None', benchmark_compiled_module)
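
For orientation: the shapes asserted in call() (two 1024 -> 3072 projections feeding a fused SiLU-multiply, then a 3072 -> 1024 projection, all in bf16) match a SwiGLU-style feed-forward block. Below is a minimal eager-mode sketch under that assumption; the class and parameter names are illustrative, not taken from this repository.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLUMLP(nn.Module):
    """Hypothetical eager-mode counterpart of the compiled wrapper above."""
    def __init__(self, hidden: int = 1024, intermediate: int = 3072):
        super().__init__()
        kw = dict(bias=False, dtype=torch.bfloat16)
        self.gate_proj = nn.Linear(hidden, intermediate, **kw)  # arg0_1: (3072, 1024)
        self.up_proj = nn.Linear(hidden, intermediate, **kw)    # arg4_1: (3072, 1024)
        self.down_proj = nn.Linear(intermediate, hidden, **kw)  # arg5_1: (1024, 3072)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Inductor decomposes aten.silu on bf16 into the fp32 upcast, sigmoid,
        # multiply, and downcast seen in the graph fragment above, then fuses
        # the trailing multiply into triton_poi_fused_mul_silu_0.
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))

# Compiling such a module, e.g. torch.compile(SwiGLUMLP().cuda()), is the kind
# of call that produces a wrapper like the one in this file.
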
torchinductor_ch-epfl-345354-j/aotautograd/acxk7xhb35e5myvrfk4m2smos5f3rwybegalnbqbgtl3ghlaw2vw/entry ADDED
Binary file (96.7 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/adw7o5w6jucvlwdu4mn3nk52nno5z3lt73pmvaksrn3cahxlwc5t/entry ADDED
Binary file (5.55 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/agm67xcx3b2ejeqf3t422b43zsalmtzgitagqmb4kcd76dzg2sr6/entry ADDED
Binary file (15.3 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/ahinqqlnserz457jqclv2vjeogmqix7jcrylpuhbc64kw4k3apfy/entry ADDED
Binary file (7.48 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/ahji7b2arusm47q6ox5itjvurtws6r6kls2kskgxfnc2rqm4ojdg/entry ADDED
Binary file (4.83 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/aibbpzcrlnv7lrbehiaaab4olrvijekv6m46vdzzqh3tbnvnl67m/entry ADDED
Binary file (26.4 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/aig3hpjgzj7f27hhdphh7ozndqiwpruhugzjsiwyog75fn4y3rbj/entry ADDED
Binary file (7.63 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/algm7vsngjdke6rmqon76peuppnhsp625k5d4zxnwgwdbdueo4ay/entry ADDED
Binary file (4.37 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/amtpnp6cq6z6ddoun3fwe4zemhgpsp5jicklj6cf3qzsd3xbdeps/entry ADDED
Binary file (4.65 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/aqskia64x2j4xks7dhp5cpq52le5j6js6ghxfhlvw7gfa6qr6stx/entry ADDED
Binary file (16.2 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/aub73aicaqeihl4qdqbrvljzl2qxdzyu52zezeket676qt3pkgwk/entry ADDED
Binary file (4.65 kB).
 
torchinductor_ch-epfl-345354-j/aotautograd/aypob3g4nwzt66m7ur252rhjjobqgnn4hvdhagr4474twkamikxg/entry ADDED
Binary file (18.9 kB).
 
torchinductor_ch-epfl-345354-j/bm/cbmn253c3hy77ciw3f6meqi4bsbiio5zhw7hra5np6k5jyjqetnp.py ADDED
@@ -0,0 +1,29 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+     size_hints={'x': 1},
+     filename=__file__,
+     triton_meta={'signature': {'in_ptr0': '*fp32', 'in_ptr1': '*i64', 'out_ptr0': '*fp32', 'xnumel': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_div_0', 'mutated_arg_names': [], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_div_0(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+     xnumel = 1
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:]
+     xmask = tl.full([XBLOCK], True, tl.int1)
+     tmp0 = tl.load(in_ptr0 + (0))
+     tmp1 = tl.broadcast_to(tmp0, [XBLOCK])
+     tmp2 = tl.load(in_ptr1 + (0))
+     tmp3 = tl.broadcast_to(tmp2, [XBLOCK])
+     tmp4 = tmp3.to(tl.float32)
+     tmp5 = (tmp1 / tmp4)
+     tl.store(out_ptr0 + (tl.full([XBLOCK], 0, tl.int32)), tmp5, None)
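
A hedged reading of triton_poi_fused_div_0: it loads one fp32 value and one int64 value, casts the count to fp32, and stores the quotient. That is the shape of a sum-then-average step, for example a summed loss divided by a token count. A one-line eager equivalent (variable names illustrative):

import torch

loss_sum = torch.tensor([2.5], device="cuda")     # in_ptr0: *fp32, one element
num_items = torch.tensor([8000], device="cuda")   # in_ptr1: *i64, one element
mean = loss_sum / num_items.to(torch.float32)     # stored to out_ptr0 as fp32
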
torchinductor_ch-epfl-345354-j/ce/cceyvpztlniy45jdq6sxx7o44obzjinfuxgsvnlhcr3hjdvmek73.py ADDED
@@ -0,0 +1,38 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.persistent_reduction(
+     size_hints={'x': 1024, 'r0_': 64},
+     reduction_hint=ReductionHint.OUTER,
+     filename=__file__,
+     triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_mul_sum_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_per_fused__to_copy_mul_sum_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr):
+     xnumel = 1024
+     r0_numel = 40
+     R0_BLOCK: tl.constexpr = 64
+     rnumel = r0_numel
+     RBLOCK: tl.constexpr = R0_BLOCK
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+     xmask = xindex < xnumel
+     r0_index = tl.arange(0, R0_BLOCK)[None, :]
+     r0_offset = 0
+     r0_mask = r0_index < r0_numel
+     roffset = r0_offset
+     rindex = r0_index
+     r0_1 = r0_index
+     x0 = xindex
+     tmp0 = tl.load(in_ptr0 + (x0 + 1024*r0_1), xmask & r0_mask, other=0.0)
+     tmp1 = tl.broadcast_to(tmp0, [XBLOCK, R0_BLOCK])
+     tmp3 = tl.where(r0_mask & xmask, tmp1, 0)
+     tmp4 = tl.sum(tmp3, 1)[:, None]
+     tl.store(out_ptr0 + (x0), tmp4, xmask)
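
Reading the indexing (x0 + 1024*r0_1 with xnumel = 1024 and r0_numel = 40), triton_per_fused__to_copy_mul_sum_1 sums a row-major (40, 1024) fp32 buffer over its first dimension and writes the 1024 results through a bf16 pointer, a pattern typical of accumulating a per-column quantity such as a weight gradient. A minimal eager sketch under that assumption (tensor names illustrative):

import torch

partials = torch.randn(40, 1024, device="cuda")    # in_ptr0: fp32, row-major
col_sums = partials.sum(dim=0).to(torch.bfloat16)  # out_ptr0: one bf16 value per column
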
torchinductor_ch-epfl-345354-j/cf/ae1632ffa009afdc4d40d5477a8e2ffd544972ad9ddf0c636c451826b3219579.best_config ADDED
@@ -0,0 +1 @@
+ {"XBLOCK": 32, "num_warps": 8, "num_stages": 1, "configs_hash": "22b8c9e89632e6687ce26aaad980a76bbf5ee683fff317f3a6d7989c7528ff63", "found_by_coordesc": false, "time_taken_ms": 108}
torchinductor_ch-epfl-345354-j/cf/ccfnt2f53rlwauznvnabnitvjchbzg7at22w4x4fskqzmyirxuxq.py ADDED
@@ -0,0 +1,50 @@
+
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.persistent_reduction(
+     size_hints={'x': 131072, 'r0_': 128},
+     reduction_hint=ReductionHint.INNER,
+     filename=__file__,
+     triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*bf16', 'in_ptr1': '*bf16', 'out_ptr0': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=42, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+     inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 2, 'num_reduction': 1, 'backend_hash': '8D9A40F96256AE993B0CB3DAC1136935BA540F7848683690590C84AF795CC5ED', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+ )
+ @triton.jit
+ def triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0(in_out_ptr0, in_ptr0, in_ptr1, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr):
+     r0_numel = 128
+     R0_BLOCK: tl.constexpr = 128
+     rnumel = r0_numel
+     RBLOCK: tl.constexpr = R0_BLOCK
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
+     xmask = xindex < xnumel
+     r0_index = tl.arange(0, R0_BLOCK)[None, :]
+     r0_offset = 0
+     r0_mask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
+     roffset = r0_offset
+     rindex = r0_index
+     r0_1 = r0_index
+     x0 = xindex
+     tmp0 = tl.load(in_ptr0 + (r0_1 + 128*x0), xmask, other=0.0).to(tl.float32)
+     tmp12 = tl.load(in_ptr1 + (r0_1), None, eviction_policy='evict_last').to(tl.float32)
+     tmp1 = tmp0.to(tl.float32)
+     tmp2 = tmp1 * tmp1
+     tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
+     tmp5 = tl.where(xmask, tmp3, 0)
+     tmp6 = tl.sum(tmp5, 1)[:, None]
+     tmp7 = 128.0
+     tmp8 = (tmp6 / tmp7)
+     tmp9 = 1e-06
+     tmp10 = tmp8 + tmp9
+     tmp11 = libdevice.rsqrt(tmp10)
+     tmp13 = tmp1 * tmp11
+     tmp14 = tmp13.to(tl.float32)
+     tmp15 = tmp12 * tmp14
+     tl.debug_barrier()
+     tl.store(in_out_ptr0 + (x0), tmp11, xmask)
+     tl.store(out_ptr0 + (r0_1 + 128*x0), tmp15, xmask)
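
The arithmetic in this kernel (mean of squares over a trailing dimension of 128, add eps = 1e-06, rsqrt, scale by a per-channel weight, with the fp32 inverse RMS also saved through in_out_ptr0 for reuse) is a standard RMSNorm forward computed in fp32 on bf16 data. A minimal eager equivalent; shapes other than the size-128 axis are illustrative:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize in fp32, as the kernel does, then cast back and scale.
    x32 = x.to(torch.float32)
    inv_rms = torch.rsqrt(x32.pow(2).mean(dim=-1, keepdim=True) + eps)  # tmp11
    return weight * (x32 * inv_rms).to(torch.bfloat16)                  # tmp15

x = torch.randn(4096, 128, device="cuda", dtype=torch.bfloat16)
w = torch.ones(128, device="cuda", dtype=torch.bfloat16)
y = rms_norm(x, w)
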