diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Generated on:
- Linux x86_64 | Linux-5.10.244-240.970.amzn2.x86_64-x86_64-with-glibc2.35 + Linux x86_64 | Linux-6.12.53-69.119.amzn2023.x86_64-x86_64-with-glibc2.35
@@ -4106,7 +3888,7 @@ body[data-tool="eraser"] .main-content { ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.22s | Raw @@ -4122,16 +3904,16 @@ Cell: nv | 0.23s
-
Fri Oct 31 20:00:00 2025       
+
Mon Nov 10 21:57:39 2025       
 +-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
-|-----------------------------------------+------------------------+----------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   32C    P0            101W /  350W |       0MiB /  46068MiB |    100%      Default |
+| N/A   26C    P0             88W /  350W |       0MiB /  46068MiB |     22%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -4155,7 +3937,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.58s
+Cell: benchmark | 38.43s
  | 
 
 Raw
@@ -4234,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.068ms      1195.27%       1.068ms       1.068ms             1  
-                                            torch_eager        14.00%     388.140us        99.71%       2.764ms       2.764ms       0.000us         0.00%      90.528us      90.528us             1  
-                                              aten::mul         6.16%     170.676us        10.43%     289.217us      12.051us      46.911us        52.52%      46.911us       1.955us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.911us        52.52%      46.911us       1.955us            24  
-                                            aten::copy_         4.25%     117.935us        62.65%       1.737ms      96.500us      29.185us        32.68%      30.401us       1.689us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.561us        25.26%      22.561us       1.880us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.216us        14.80%      13.216us       1.101us            12  
-                                            aten::clone         1.62%      44.961us        61.78%       1.713ms     285.451us       0.000us         0.00%       7.840us       1.307us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us         7.42%       6.624us       1.104us             6  
-                                              aten::sub         1.59%      44.071us         2.54%      70.301us      11.717us       6.624us         7.42%       6.624us       1.104us             6  
-                                              aten::add         1.26%      34.801us         2.08%      57.721us       9.620us       6.592us         7.38%       6.592us       1.099us             6  
-                                Activity Buffer Request        53.17%       1.474ms        53.17%       1.474ms       1.474ms       1.216us         1.36%       1.216us       1.216us             1  
-                                    aten::empty_strided         2.35%      65.251us         2.35%      65.251us      10.875us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.98%      82.752us         2.98%      82.752us      13.792us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.05%      84.591us         4.03%     111.694us       4.654us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.98%      27.103us         0.98%      27.103us       1.129us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.29%     229.882us         8.29%     229.882us       4.789us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.29%       8.120us         0.29%       8.120us       8.120us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.315ms      1474.39%       1.315ms       1.315ms             1  
+                                            torch_eager         7.00%     401.548us        82.40%       4.729ms       4.729ms       0.000us         0.00%      90.432us      90.432us             1  
+                                              aten::mul         3.25%     186.430us         5.35%     307.044us      12.793us      46.943us        52.62%      46.943us       1.956us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.943us        52.62%      46.943us       1.956us            24  
+                                            aten::copy_         2.48%     142.261us        48.48%       2.782ms     154.576us      29.122us        32.64%      30.338us       1.685us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.433us        25.14%      22.433us       1.869us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.151us        14.74%      13.151us       1.096us            12  
+                                            aten::clone         0.88%      50.441us        59.65%       3.423ms     570.575us       0.000us         0.00%       7.905us       1.318us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.689us         7.50%       6.689us       1.115us             6  
+                                              aten::sub         0.82%      47.350us         1.28%      73.411us      12.235us       6.591us         7.39%       6.591us       1.098us             6  
+                                              aten::add         0.64%      36.811us         1.04%      59.601us       9.934us       6.560us         7.35%       6.560us       1.093us             6  
+                                Activity Buffer Request        39.92%       2.291ms        39.92%       2.291ms       2.291ms       1.216us         1.36%       1.216us       1.216us             1  
+                                    aten::empty_strided        16.52%     948.386us        16.52%     948.386us     158.064us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         1.38%      78.980us         1.38%      78.980us      13.163us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         1.46%      83.925us         1.86%     106.703us       4.446us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.40%      22.778us         0.40%      22.778us       0.949us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.66%     439.430us         7.66%     439.430us       9.155us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        17.60%       1.010ms        17.60%       1.010ms       1.010ms       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.772ms
-Self CUDA time total: 89.312us
+Self CPU time total: 5.740ms
+Self CUDA time total: 89.216us
 
 
 
@@ -4264,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     960.345us      1063.10%     960.345us     960.345us             1  
-                                            torch_eager        11.94%     304.272us        99.78%       2.543ms       2.543ms       0.000us         0.00%      91.454us      91.454us             1  
-                                              aten::mul         6.19%     157.625us        10.77%     274.398us      11.433us      47.776us        52.89%      47.776us       1.991us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.776us        52.89%      47.776us       1.991us            24  
-                                            aten::copy_         4.14%     105.392us        66.58%       1.697ms      94.258us      29.343us        32.48%      30.463us       1.692us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.559us        24.97%      22.559us       1.880us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.215us        14.63%      13.215us       1.101us            12  
-                                            aten::clone         0.97%      24.733us        63.76%       1.625ms     270.825us       0.000us         0.00%       7.904us       1.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.51%       6.784us       1.131us             6  
-                                              aten::add         1.23%      31.452us         2.12%      54.072us       9.012us       6.623us         7.33%       6.623us       1.104us             6  
-                                              aten::sub         1.53%      39.032us         2.55%      64.964us      10.827us       6.592us         7.30%       6.592us       1.099us             6  
-                                Activity Buffer Request        57.59%       1.468ms        57.59%       1.468ms       1.468ms       1.120us         1.24%       1.120us       1.120us             1  
-                                    aten::empty_strided         1.31%      33.410us         1.31%      33.410us       5.568us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.43%      61.963us         2.43%      61.963us      10.327us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.76%      70.222us         3.54%      90.271us       3.761us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.79%      20.049us         0.79%      20.049us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.91%     226.937us         8.91%     226.937us       4.728us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.22%       5.590us         0.22%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     967.576us      1072.55%     967.576us     967.576us             1  
+                                            torch_eager        10.80%     301.919us        99.80%       2.790ms       2.790ms       0.000us         0.00%      91.365us      91.365us             1  
+                                              aten::mul         5.82%     162.824us         9.87%     275.997us      11.500us      47.523us        52.68%      47.523us       1.980us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.523us        52.68%      47.523us       1.980us            24  
+                                            aten::copy_         4.18%     116.751us        70.01%       1.957ms     108.723us      29.282us        32.46%      30.434us       1.691us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.497us        24.94%      22.497us       1.875us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.408us        14.86%      13.408us       1.117us            12  
+                                            aten::clone         0.79%      22.172us        66.92%       1.871ms     311.782us       0.000us         0.00%       7.937us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.52%       6.785us       1.131us             6  
+                                              aten::add         1.23%      34.361us         2.02%      56.562us       9.427us       6.720us         7.45%       6.720us       1.120us             6  
+                                              aten::sub         1.36%      38.010us         2.19%      61.310us      10.218us       6.688us         7.41%       6.688us       1.115us             6  
+                                Activity Buffer Request        61.66%       1.724ms        61.66%       1.724ms       1.724ms       1.152us         1.28%       1.152us       1.152us             1  
+                                    aten::empty_strided         1.16%      32.541us         1.16%      32.541us       5.424us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         2.01%      56.260us         2.01%      56.260us       9.377us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.34%      65.363us         2.94%      82.214us       3.426us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.60%      16.851us         0.60%      16.851us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.84%     219.114us         7.84%     219.114us       4.565us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.580us         0.20%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.548ms
-Self CUDA time total: 90.334us
+Self CPU time total: 2.795ms
+Self CUDA time total: 90.213us
 
 
 
@@ -4294,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     989.616us      1051.23%     989.616us     989.616us             1  
-                                            torch_eager        12.09%     307.194us        99.76%       2.536ms       2.536ms       0.000us         0.00%      95.450us      95.450us             1  
-                                              aten::mul         6.35%     161.494us        11.09%     281.865us      11.744us      48.958us        52.01%      48.958us       2.040us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.958us        52.01%      48.958us       2.040us            24  
-                                            aten::copy_         4.30%     109.293us        66.10%       1.680ms      93.343us      30.814us        32.73%      32.125us       1.785us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.943us        24.37%      22.943us       1.912us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.367us        15.26%      14.367us       1.197us            12  
-                                            aten::clone         0.97%      24.599us        62.75%       1.595ms     265.823us       0.000us         0.00%       9.182us       1.530us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us         8.36%       7.871us       1.312us             6  
-                                              aten::add         1.20%      30.579us         2.08%      52.891us       8.815us       7.199us         7.65%       7.199us       1.200us             6  
-                                              aten::sub         1.49%      37.871us         2.53%      64.231us      10.705us       7.168us         7.61%       7.168us       1.195us             6  
-                                Activity Buffer Request        56.57%       1.438ms        56.57%       1.438ms       1.438ms       1.311us         1.39%       1.311us       1.311us             1  
-                                    aten::empty_strided         1.38%      35.041us         1.38%      35.041us       5.840us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         2.38%      60.441us         2.38%      60.441us      10.074us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.77%      70.298us         3.53%      89.841us       3.743us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.77%      19.543us         0.77%      19.543us       0.814us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.50%     241.544us         9.50%     241.544us       5.032us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.24%       6.100us         0.24%       6.100us       6.100us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.639us       987.31%     927.639us     927.639us             1  
+                                            torch_eager        10.07%     282.335us        99.80%       2.798ms       2.798ms       0.000us         0.00%      95.268us      95.268us             1  
+                                              aten::mul         5.75%     161.290us         9.68%     271.373us      11.307us      48.769us        51.91%      48.769us       2.032us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.769us        51.91%      48.769us       2.032us            24  
+                                            aten::copy_         3.66%     102.626us        71.21%       1.996ms     110.912us      30.720us        32.70%      32.032us       1.780us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.39%      22.912us       1.909us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.467us        15.40%      14.467us       1.206us            12  
+                                            aten::clone         0.79%      22.060us        68.41%       1.918ms     319.628us       0.000us         0.00%       9.120us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         8.31%       7.808us       1.301us             6  
+                                              aten::sub         1.36%      38.040us         2.18%      61.002us      10.167us       7.265us         7.73%       7.265us       1.211us             6  
+                                              aten::add         1.15%      32.220us         1.90%      53.280us       8.880us       7.202us         7.67%       7.202us       1.200us             6  
+                                Activity Buffer Request        63.51%       1.780ms        63.51%       1.780ms       1.780ms       1.312us         1.40%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.12%      31.490us         1.12%      31.490us       5.248us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         1.87%      52.452us         1.87%      52.452us       8.742us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.25%      63.104us         2.86%      80.042us       3.335us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.60%      16.938us         0.60%      16.938us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.67%     215.090us         7.67%     215.090us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.20%       5.470us         0.20%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.542ms
-Self CUDA time total: 94.139us
+Self CPU time total: 2.803ms
+Self CUDA time total: 93.956us
 
 
 
@@ -4324,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.327us       916.02%     928.327us     928.327us             1  
-                                            torch_eager        12.51%     290.049us        99.77%       2.313ms       2.313ms       0.000us         0.00%     102.689us     102.689us             1  
-                                              aten::mul         6.36%     147.401us        11.12%     257.946us      10.748us      52.800us        52.10%      52.800us       2.200us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.800us        52.10%      52.800us       2.200us            24  
-                                            aten::copy_         4.62%     107.204us        65.04%       1.508ms      83.777us      32.415us        31.99%      33.760us       1.876us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        24.19%      24.511us       2.043us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.129us        15.92%      16.129us       1.344us            12  
-                                            aten::clone         0.98%      22.822us        61.74%       1.431ms     238.579us       0.000us         0.00%       9.249us       1.542us             6  
-                                              aten::add         1.37%      31.668us         2.34%      54.320us       9.053us       8.096us         7.99%       8.096us       1.349us             6  
-                                              aten::sub         1.57%      36.291us         2.61%      60.431us      10.072us       8.033us         7.93%       8.033us       1.339us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         7.80%       7.904us       1.317us             6  
-                                Activity Buffer Request        46.02%       1.067ms        46.02%       1.067ms       1.067ms       1.345us         1.33%       1.345us       1.345us             1  
-                                    aten::empty_strided         1.38%      31.940us         1.38%      31.940us       5.323us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.71%     271.508us        11.71%     271.508us      45.251us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.99%      69.429us         3.79%      87.781us       3.658us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.79%      18.352us         0.79%      18.352us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         9.47%     219.548us         9.47%     219.548us       4.574us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.23%       5.380us         0.23%       5.380us       5.380us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.847us       904.69%     918.847us     918.847us             1  
+                                            torch_eager        11.08%     278.185us        99.79%       2.506ms       2.506ms       0.000us         0.00%     102.877us     102.877us             1  
+                                              aten::mul         6.15%     154.372us        10.54%     264.762us      11.032us      52.638us        51.83%      52.638us       2.193us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.638us        51.83%      52.638us       2.193us            24  
+                                            aten::copy_         4.16%     104.580us        68.26%       1.714ms      95.219us      32.416us        31.92%      33.728us       1.874us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.26%      24.641us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.511us        16.26%      16.511us       1.376us            12  
+                                            aten::clone         0.84%      21.090us        65.15%       1.636ms     272.671us       0.000us         0.00%       9.087us       1.514us             6  
+                                              aten::sub         1.51%      38.031us         2.44%      61.190us      10.198us       8.288us         8.16%       8.288us       1.381us             6  
+                                              aten::add         1.29%      32.470us         2.19%      54.880us       9.147us       8.223us         8.10%       8.223us       1.371us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.66%       7.775us       1.296us             6  
+                                Activity Buffer Request        52.27%       1.312ms        52.27%       1.312ms       1.312ms       1.312us         1.29%       1.312us       1.312us             1  
+                                    aten::empty_strided         1.29%      32.302us         1.29%      32.302us       5.384us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         9.44%     236.943us         9.44%     236.943us      39.491us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.53%      63.496us         3.16%      79.393us       3.308us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.63%      15.897us         0.63%      15.897us       0.662us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         8.60%     215.892us         8.60%     215.892us       4.498us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.21%       5.340us         0.21%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.319ms
-Self CUDA time total: 101.344us
+Self CPU time total: 2.511ms
+Self CUDA time total: 101.565us
 
 
 
@@ -4354,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.018ms      1082.59%       1.018ms       1.018ms             1  
-                                            torch_eager        11.47%     329.955us        99.81%       2.870ms       2.870ms       0.000us         0.00%      95.358us      95.358us             1  
-                                              aten::mul         5.65%     162.614us         9.86%     283.677us      11.820us      49.056us        52.16%      49.056us       2.044us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.056us        52.16%      49.056us       2.044us            24  
-                                            aten::copy_         3.88%     111.664us        68.17%       1.960ms     108.907us      30.720us        32.66%      32.032us       1.780us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        24.33%      22.880us       1.907us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.270us        15.17%      14.270us       1.189us            12  
-                                            aten::clone         1.07%      30.831us        65.73%       1.890ms     315.021us       0.000us         0.00%       9.152us       1.525us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us         8.34%       7.840us       1.307us             6  
-                                              aten::add         1.15%      33.191us         2.07%      59.441us       9.907us       7.167us         7.62%       7.167us       1.194us             6  
-                                              aten::sub         1.59%      45.863us         2.59%      74.463us      12.411us       7.103us         7.55%       7.103us       1.184us             6  
-                                Activity Buffer Request        50.07%       1.440ms        50.07%       1.440ms       1.440ms       1.312us         1.40%       1.312us       1.312us             1  
-                                    aten::empty_strided         1.26%      36.310us         1.26%      36.310us       6.052us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.96%     343.839us        11.96%     343.839us      57.306us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      75.860us         3.31%      95.264us       3.969us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.67%      19.404us         0.67%      19.404us       0.809us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.38%     240.995us         8.38%     240.995us       5.021us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.330us         0.19%       5.330us       5.330us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     991.709us      1060.94%     991.709us     991.709us             1  
+                                            torch_eager        10.56%     336.649us        99.82%       3.183ms       3.183ms       0.000us         0.00%      94.755us      94.755us             1  
+                                              aten::mul         5.20%     165.794us         8.73%     278.295us      11.596us      48.674us        52.07%      48.674us       2.028us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.674us        52.07%      48.674us       2.028us            24  
+                                            aten::copy_         3.76%     119.863us        72.07%       2.298ms     127.674us      30.622us        32.76%      31.902us       1.772us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.878us        24.47%      22.878us       1.907us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.179us        15.17%      14.179us       1.182us            12  
+                                            aten::clone         0.88%      28.161us        69.55%       2.218ms     369.616us       0.000us         0.00%       9.024us       1.504us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.28%       7.744us       1.291us             6  
+                                              aten::sub         1.28%      40.920us         2.05%      65.511us      10.918us       7.138us         7.64%       7.138us       1.190us             6  
+                                              aten::add         1.05%      33.330us         1.81%      57.620us       9.603us       7.041us         7.53%       7.041us       1.173us             6  
+                                Activity Buffer Request        55.60%       1.773ms        55.60%       1.773ms       1.773ms       1.280us         1.37%       1.280us       1.280us             1  
+                                    aten::empty_strided         1.06%      33.640us         1.06%      33.640us       5.607us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.74%     342.585us        10.74%     342.585us      57.097us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.14%      68.349us         2.66%      84.959us       3.540us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.52%      16.610us         0.52%      16.610us       0.692us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.03%     224.072us         7.03%     224.072us       4.668us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.590us         0.18%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.876ms
-Self CUDA time total: 94.046us
+Self CPU time total: 3.189ms
+Self CUDA time total: 93.475us
 
 
 
@@ -4384,27 +4166,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     913.335us       900.40%     913.335us     913.335us             1  
-                                            torch_eager        10.58%     290.726us        99.81%       2.742ms       2.742ms       0.000us         0.00%     102.781us     102.781us             1  
-                                              aten::mul         5.30%     145.663us         9.31%     255.637us      10.652us      52.735us        51.99%      52.735us       2.197us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.735us        51.99%      52.735us       2.197us            24  
-                                            aten::copy_         3.74%     102.751us        70.53%       1.937ms     107.622us      32.638us        32.18%      33.982us       1.888us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.735us        24.38%      24.735us       2.061us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.064us        15.84%      16.064us       1.339us            12  
-                                            aten::clone         0.88%      24.121us        67.96%       1.867ms     311.110us       0.000us         0.00%       9.247us       1.541us             6  
-                                              aten::sub         1.29%      35.411us         2.16%      59.202us       9.867us       8.033us         7.92%       8.033us       1.339us             6  
-                                              aten::add         1.13%      30.931us         1.93%      52.952us       8.825us       8.031us         7.92%       8.031us       1.339us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.903us         7.79%       7.903us       1.317us             6  
-                                Activity Buffer Request        52.85%       1.452ms        52.85%       1.452ms       1.452ms       1.344us         1.32%       1.344us       1.344us             1  
-                                    aten::empty_strided         1.21%      33.351us         1.21%      33.351us       5.559us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.71%     321.577us        11.71%     321.577us      53.596us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.55%      69.990us         3.22%      88.522us       3.688us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.67%      18.532us         0.67%      18.532us       0.772us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.90%     216.969us         7.90%     216.969us       4.520us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.19%       5.091us         0.19%       5.091us       5.091us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     941.177us       926.36%     941.177us     941.177us             1  
+                                            torch_eager         9.56%     295.804us        99.83%       3.088ms       3.088ms       0.000us         0.00%     102.911us     102.911us             1  
+                                              aten::mul         5.03%     155.643us         8.60%     265.986us      11.083us      52.802us        51.97%      52.802us       2.200us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.802us        51.97%      52.802us       2.200us            24  
+                                            aten::copy_         3.66%     113.330us        73.34%       2.269ms     126.052us      32.447us        31.94%      33.759us       1.876us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        24.31%      24.703us       2.059us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.350us        16.09%      16.350us       1.363us            12  
+                                            aten::clone         0.71%      21.820us        70.53%       2.182ms     363.694us       0.000us         0.00%       9.056us       1.509us             6  
+                                              aten::sub         1.30%      40.120us         2.07%      63.950us      10.658us       8.223us         8.09%       8.223us       1.370us             6  
+                                              aten::add         1.17%      36.201us         1.90%      58.931us       9.822us       8.127us         8.00%       8.127us       1.355us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.62%       7.744us       1.291us             6  
+                                Activity Buffer Request        57.23%       1.771ms        57.23%       1.771ms       1.771ms       1.312us         1.29%       1.312us       1.312us             1  
+                                    aten::empty_strided         0.98%      30.371us         0.98%      30.371us       5.062us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.40%     321.885us        10.40%     321.885us      53.647us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.12%      65.592us         2.67%      82.622us       3.443us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.55%      17.030us         0.55%      17.030us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.11%     219.985us         7.11%     219.985us       4.583us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.17%       5.340us         0.17%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.747ms
-Self CUDA time total: 101.437us
+Self CPU time total: 3.094ms
+Self CUDA time total: 101.599us
 
 
 
@@ -4414,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     929.433us       768.61%     929.433us     929.433us             1  
-                                            torch_eager        10.84%     297.701us        99.80%       2.742ms       2.742ms       0.000us         0.00%     122.716us     122.716us             1  
-                                              aten::mul         5.42%     148.850us         9.41%     258.632us      10.776us      62.014us        51.28%      62.014us       2.584us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.014us        51.28%      62.014us       2.584us            24  
-                                            aten::copy_         3.77%     103.682us        70.14%       1.927ms     107.043us      39.328us        32.52%      41.120us       2.284us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.82%      28.800us       2.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.582us        16.19%      19.582us       1.632us            12  
-                                            aten::clone         0.88%      24.131us        67.45%       1.853ms     308.828us       0.000us         0.00%      12.320us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.71%      10.528us       1.755us             6  
-                                              aten::sub         1.29%      35.482us         2.16%      59.433us       9.905us       9.792us         8.10%       9.792us       1.632us             6  
-                                              aten::add         1.13%      31.104us         1.94%      53.172us       8.862us       9.790us         8.10%       9.790us       1.632us             6  
-                                Activity Buffer Request        52.94%       1.454ms        52.94%       1.454ms       1.454ms       1.792us         1.48%       1.792us       1.792us             1  
-                                    aten::empty_strided         1.18%      32.542us         1.18%      32.542us       5.424us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        11.19%     307.407us        11.19%     307.407us      51.235us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.56%      70.268us         3.25%      89.361us       3.723us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      19.093us         0.70%      19.093us       0.796us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         7.91%     217.262us         7.91%     217.262us       4.526us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.370us         0.20%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.963us       782.64%     943.963us     943.963us             1  
+                                            torch_eager         9.85%     301.136us        99.82%       3.051ms       3.051ms       0.000us         0.00%     122.468us     122.468us             1  
+                                              aten::mul         5.14%     157.189us         8.67%     264.988us      11.041us      61.985us        51.39%      61.985us       2.583us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.985us        51.39%      61.985us       2.583us            24  
+                                            aten::copy_         3.53%     107.981us        72.58%       2.218ms     123.247us      39.362us        32.64%      41.218us       2.290us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.802us        23.88%      28.802us       2.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.265us        15.97%      19.265us       1.605us            12  
+                                            aten::clone         0.97%      29.629us        70.14%       2.144ms     357.356us       0.000us         0.00%      12.416us       2.069us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us         8.76%      10.560us       1.760us             6  
+                                              aten::add         1.14%      34.930us         1.90%      58.161us       9.693us       9.633us         7.99%       9.633us       1.606us             6  
+                                              aten::sub         1.25%      38.210us         2.05%      62.510us      10.418us       9.632us         7.99%       9.632us       1.605us             6  
+                                Activity Buffer Request        57.00%       1.742ms        57.00%       1.742ms       1.742ms       1.856us         1.54%       1.856us       1.856us             1  
+                                    aten::empty_strided         1.01%      31.021us         1.01%      31.021us       5.170us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        10.03%     306.454us        10.03%     306.454us      51.076us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.23%      68.242us         2.79%      85.430us       3.560us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.56%      17.188us         0.56%      17.188us       0.716us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.10%     217.131us         7.10%     217.131us       4.524us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.390us         0.18%       5.390us       5.390us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.747ms
-Self CUDA time total: 120.924us
+Self CPU time total: 3.057ms
+Self CUDA time total: 120.612us
 
 
 
@@ -4444,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     942.082us       549.37%     942.082us     942.082us             1  
-                                            torch_eager        20.10%     308.752us        99.67%       1.531ms       1.531ms       0.000us         0.00%     174.365us     174.365us             1  
-                                              aten::mul         9.79%     150.414us        16.96%     260.516us      10.855us      89.056us        51.93%      89.056us       3.711us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.056us        51.93%      89.056us       3.711us            24  
-                                            aten::copy_         6.91%     106.224us        46.22%     710.060us      39.448us      57.503us        33.53%      60.383us       3.355us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.511us        23.62%      40.511us       3.376us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.926us        14.54%      24.926us       2.077us            12  
-                                            aten::clone         1.37%      21.029us        40.87%     627.796us     104.633us       0.000us         0.00%      19.872us       3.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.91%      16.992us       2.832us             6  
-                                              aten::sub         2.26%      34.730us         3.83%      58.781us       9.797us      12.479us         7.28%      12.479us       2.080us             6  
-                                              aten::add         2.00%      30.683us         3.45%      52.973us       8.829us      12.447us         7.26%      12.447us       2.075us             6  
-                                Activity Buffer Request        16.15%     248.056us        16.15%     248.056us     248.056us       2.880us         1.68%       2.880us       2.880us             1  
-                                    aten::empty_strided         2.04%      31.392us         2.04%      31.392us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        18.97%     291.479us        18.97%     291.479us      48.580us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.49%      68.986us         5.70%      87.586us       3.649us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.21%      18.600us         1.21%      18.600us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        14.37%     220.744us        14.37%     220.744us       4.599us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.33%       5.080us         0.33%       5.080us       5.080us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.245us       538.18%     928.245us     928.245us             1  
+                                            torch_eager        19.14%     292.425us        99.66%       1.523ms       1.523ms       0.000us         0.00%     175.325us     175.325us             1  
+                                              aten::mul        10.16%     155.270us        17.20%     262.742us      10.948us      89.630us        51.97%      89.630us       3.735us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.630us        51.97%      89.630us       3.735us            24  
+                                            aten::copy_         6.82%     104.170us        46.76%     714.441us      39.691us      57.920us        33.58%      60.768us       3.376us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.768us        23.64%      40.768us       3.397us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.927us        14.45%      24.927us       2.077us            12  
+                                            aten::clone         1.34%      20.471us        41.24%     630.180us     105.030us       0.000us         0.00%      20.000us       3.333us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.152us         9.94%      17.152us       2.859us             6  
+                                              aten::sub         2.56%      39.072us         4.07%      62.112us      10.352us      12.480us         7.24%      12.480us       2.080us             6  
+                                              aten::add         2.20%      33.610us         3.65%      55.810us       9.302us      12.447us         7.22%      12.447us       2.075us             6  
+                                Activity Buffer Request        16.69%     254.944us        16.69%     254.944us     254.944us       2.848us         1.65%       2.848us       2.848us             1  
+                                    aten::empty_strided         2.04%      31.181us         2.04%      31.181us       5.197us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        19.06%     291.294us        19.06%     291.294us      48.549us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.37%      66.700us         5.47%      83.522us       3.480us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.10%      16.822us         1.10%      16.822us       0.701us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.19%     216.745us        14.19%     216.745us       4.516us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.34%       5.240us         0.34%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.536ms
-Self CUDA time total: 171.485us
+Self CPU time total: 1.528ms
+Self CUDA time total: 172.477us
 
 
 
@@ -4474,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     906.096us       748.31%     906.096us     906.096us             1  
-                                            torch_eager        18.91%     280.775us        99.66%       1.480ms       1.480ms       0.000us         0.00%     122.910us     122.910us             1  
-                                              aten::mul        10.01%     148.664us        17.45%     259.167us      10.799us      62.174us        51.35%      62.174us       2.591us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.174us        51.35%      62.174us       2.591us            24  
-                                            aten::copy_         6.88%     102.100us        46.50%     690.526us      38.363us      39.392us        32.53%      41.216us       2.290us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.78%      28.800us       2.400us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.520us        16.12%      19.520us       1.627us            12  
-                                            aten::clone         1.45%      21.579us        41.36%     614.176us     102.363us       0.000us         0.00%      12.416us       2.069us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us         8.75%      10.592us       1.765us             6  
-                                              aten::sub         2.32%      34.432us         3.90%      57.973us       9.662us       9.760us         8.06%       9.760us       1.627us             6  
-                                              aten::add         2.12%      31.432us         3.61%      53.552us       8.925us       9.760us         8.06%       9.760us       1.627us             6  
-                                Activity Buffer Request        17.05%     253.136us        17.05%     253.136us     253.136us       1.824us         1.51%       1.824us       1.824us             1  
-                                    aten::empty_strided         2.06%      30.533us         2.06%      30.533us       5.089us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        18.50%     274.717us        18.50%     274.717us      45.786us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.53%      67.311us         5.78%      85.812us       3.575us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.25%      18.501us         1.25%      18.501us       0.771us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        14.60%     216.737us        14.60%     216.737us       4.515us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.34%       4.981us         0.34%       4.981us       4.981us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     923.899us       767.46%     923.899us     923.899us             1  
+                                            torch_eager        19.14%     287.798us        99.65%       1.499ms       1.499ms       0.000us         0.00%     122.144us     122.144us             1  
+                                              aten::mul        10.49%     157.698us        17.70%     266.255us      11.094us      61.982us        51.49%      61.982us       2.583us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.982us        51.49%      61.982us       2.583us            24  
+                                            aten::copy_         6.99%     105.118us        46.36%     697.187us      38.733us      39.264us        32.62%      41.024us       2.279us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.832us        23.95%      28.832us       2.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.138us        15.90%      19.138us       1.595us            12  
+                                            aten::clone         1.32%      19.822us        40.79%     613.519us     102.253us       0.000us         0.00%      12.192us       2.032us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.67%      10.432us       1.739us             6  
+                                              aten::sub         2.51%      37.801us         4.08%      61.341us      10.224us       9.570us         7.95%       9.570us       1.595us             6  
+                                              aten::add         2.16%      32.471us         3.63%      54.661us       9.110us       9.568us         7.95%       9.568us       1.595us             6  
+                                Activity Buffer Request        16.71%     251.314us        16.71%     251.314us     251.314us       1.760us         1.46%       1.760us       1.760us             1  
+                                    aten::empty_strided         2.00%      30.060us         2.00%      30.060us       5.010us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        18.58%     279.394us        18.58%     279.394us      46.566us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.31%      64.750us         5.43%      81.609us       3.400us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.12%      16.859us         1.12%      16.859us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.34%     215.648us        14.34%     215.648us       4.493us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.35%       5.220us         0.35%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.485ms
-Self CUDA time total: 121.086us
+Self CPU time total: 1.504ms
+Self CUDA time total: 120.384us
 
 
 
@@ -4504,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.294us       555.32%     954.294us     954.294us             1  
-                                            torch_eager        11.21%     307.269us        99.82%       2.735ms       2.735ms       0.000us         0.00%     174.694us     174.694us             1  
-                                              aten::mul         5.59%     153.258us         9.69%     265.580us      11.066us      89.476us        52.07%      89.476us       3.728us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.476us        52.07%      89.476us       3.728us            24  
-                                            aten::copy_         3.78%     103.631us        69.46%       1.903ms     105.735us      57.505us        33.46%      60.353us       3.353us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.545us        23.59%      40.545us       3.379us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.865us        14.47%      24.865us       2.072us            12  
-                                            aten::clone         0.89%      24.491us        66.72%       1.828ms     304.733us       0.000us         0.00%      19.808us       3.301us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.87%      16.960us       2.827us             6  
-                                              aten::add         1.15%      31.480us         1.96%      53.761us       8.960us      12.448us         7.24%      12.448us       2.075us             6  
-                                              aten::sub         1.31%      35.801us         2.17%      59.462us       9.910us      12.417us         7.23%      12.417us       2.070us             6  
-                                Activity Buffer Request        53.91%       1.477ms        53.91%       1.477ms       1.477ms       2.848us         1.66%       2.848us       2.848us             1  
-                                    aten::empty_strided         1.13%      30.930us         1.13%      30.930us       5.155us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.51%     260.666us         9.51%     260.666us      43.444us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.58%      70.761us         3.30%      90.449us       3.769us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.72%      19.688us         0.72%      19.688us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.03%     220.086us         8.03%     220.086us       4.585us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.18%       5.030us         0.18%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.259us       547.68%     943.259us     943.259us             1  
+                                            torch_eager         9.82%     293.988us        99.82%       2.988ms       2.988ms       0.000us         0.00%     175.075us     175.075us             1  
+                                              aten::mul         5.17%     154.631us         8.81%     263.742us      10.989us      89.536us        51.99%      89.536us       3.731us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.536us        51.99%      89.536us       3.731us            24  
+                                            aten::copy_         3.66%     109.570us        72.53%       2.171ms     120.590us      57.795us        33.56%      60.643us       3.369us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.835us        23.71%      40.835us       3.403us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.896us        14.46%      24.896us       2.075us            12  
+                                            aten::clone         0.74%      22.030us        69.74%       2.087ms     347.874us       0.000us         0.00%      19.808us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.85%      16.960us       2.827us             6  
+                                              aten::add         1.10%      32.890us         1.87%      55.840us       9.307us      12.481us         7.25%      12.481us       2.080us             6  
+                                              aten::sub         1.28%      38.273us         2.11%      63.142us      10.524us      12.415us         7.21%      12.415us       2.069us             6  
+                                Activity Buffer Request        58.02%       1.736ms        58.02%       1.736ms       1.736ms       2.848us         1.65%       2.848us       2.848us             1  
+                                    aten::empty_strided         1.00%      30.050us         1.00%      30.050us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.83%     264.325us         8.83%     264.325us      44.054us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.36%      70.650us         2.95%      88.161us       3.673us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.59%      17.511us         0.59%      17.511us       0.730us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.26%     217.282us         7.26%     217.282us       4.527us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.289us         0.18%       5.289us       5.289us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.740ms
-Self CUDA time total: 171.846us
+Self CPU time total: 2.993ms
+Self CUDA time total: 172.227us
 
 
 
@@ -4534,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     917.943us       324.46%     917.943us     917.943us             1  
-                                            torch_eager        18.90%     277.703us        99.65%       1.464ms       1.464ms       0.000us         0.00%     301.376us     301.376us             1  
-                                              aten::mul         9.84%     144.586us        17.44%     256.139us      10.672us     132.736us        46.92%     132.736us       5.531us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.736us        46.92%     132.736us       5.531us            24  
-                                            aten::copy_         7.06%     103.765us        45.63%     670.307us      37.239us     109.119us        38.57%     127.583us       7.088us            18  
-                                            aten::clone         1.58%      23.262us        40.78%     599.096us      99.849us       0.000us         0.00%      70.336us      11.723us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.247us        20.23%      57.247us       4.771us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.872us        18.34%      51.872us       8.645us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.057us        14.51%      41.057us       3.421us            12  
-                                              aten::add         2.13%      31.271us         3.65%      53.632us       8.939us      20.545us         7.26%      20.545us       3.424us             6  
-                                              aten::sub         2.39%      35.109us         4.06%      59.711us       9.952us      20.512us         7.25%      20.512us       3.419us             6  
-                                Activity Buffer Request        16.07%     236.106us        16.07%     236.106us     236.106us      18.464us         6.53%      18.464us      18.464us             1  
-                                    aten::empty_strided         2.35%      34.500us         2.35%      34.500us       5.750us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        18.36%     269.767us        18.36%     269.767us      44.961us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.78%      70.183us         6.04%      88.753us       3.698us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.26%      18.570us         1.26%      18.570us       0.774us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        14.92%     219.185us        14.92%     219.185us       4.566us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       5.090us         0.35%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     922.006us       322.11%     922.006us     922.006us             1  
+                                            torch_eager        19.42%     278.764us        99.64%       1.431ms       1.431ms       0.000us         0.00%     304.543us     304.543us             1  
+                                              aten::mul        10.68%     153.400us        18.09%     259.803us      10.825us     134.112us        46.85%     134.112us       5.588us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.112us        46.85%     134.112us       5.588us            24  
+                                            aten::copy_         7.65%     109.831us        44.83%     643.670us      35.759us     111.232us        38.86%     129.536us       7.196us            18  
+                                            aten::clone         1.43%      20.539us        38.82%     557.349us      92.892us       0.000us         0.00%      72.160us      12.027us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.376us        20.04%      57.376us       4.781us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.856us        18.82%      53.856us       8.976us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.895us        14.29%      40.895us       3.408us            12  
+                                              aten::sub         2.68%      38.501us         4.30%      61.692us      10.282us      20.543us         7.18%      20.543us       3.424us             6  
+                                              aten::add         2.29%      32.829us         3.81%      54.730us       9.122us      20.352us         7.11%      20.352us       3.392us             6  
+                                Activity Buffer Request        16.08%     230.904us        16.08%     230.904us     230.904us      18.304us         6.39%      18.304us      18.304us             1  
+                                    aten::empty_strided         2.06%      29.601us         2.06%      29.601us       4.933us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        16.83%     241.674us        16.83%     241.674us      40.279us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.51%      64.754us         5.69%      81.743us       3.406us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.18%      16.989us         1.18%      16.989us       0.708us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        14.82%     212.756us        14.82%     212.756us       4.432us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.240us         0.36%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.469ms
-Self CUDA time total: 282.912us
+Self CPU time total: 1.436ms
+Self CUDA time total: 286.239us
 
 
 
@@ -4564,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.832us       165.35%     931.832us     931.832us             1  
-                                            torch_eager        19.27%     283.137us        99.64%       1.464ms       1.464ms       0.000us         0.00%     587.261us     587.261us             1  
-                                            aten::copy_         7.04%     103.435us        44.90%     659.587us      36.644us     272.511us        48.36%     296.223us      16.457us            18  
-                                              aten::mul        10.36%     152.225us        18.18%     267.110us      11.130us     224.829us        39.90%     224.829us       9.368us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     224.829us        39.90%     224.829us       9.368us            24  
-                                            aten::clone         1.47%      21.550us        39.53%     580.673us      96.779us       0.000us         0.00%     205.855us      34.309us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.143us        32.32%     182.143us      30.357us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.368us        16.04%      90.368us       7.531us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.209us        11.75%      66.209us       5.517us            12  
-                                              aten::sub         2.39%      35.041us         4.07%      59.831us       9.972us      33.760us         5.99%      33.760us       5.627us             6  
-                                              aten::add         2.15%      31.591us         3.70%      54.401us       9.067us      32.449us         5.76%      32.449us       5.408us             6  
-                                Activity Buffer Request        16.23%     238.406us        16.23%     238.406us     238.406us      23.712us         4.21%      23.712us      23.712us             1  
-                                    aten::empty_strided         2.04%      29.960us         2.04%      29.960us       4.993us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.39%     255.475us        17.39%     255.475us      42.579us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.73%      69.441us         6.00%      88.092us       3.670us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.27%      18.651us         1.27%      18.651us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.30%     224.756us        15.30%     224.756us       4.682us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       5.280us         0.36%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     970.352us       169.72%     970.352us     970.352us             1  
+                                            torch_eager        19.50%     289.365us        99.64%       1.478ms       1.478ms       0.000us         0.00%     595.480us     595.480us             1  
+                                            aten::copy_         7.05%     104.551us        43.31%     642.598us      35.700us     273.596us        47.85%     297.340us      16.519us            18  
+                                              aten::mul        11.63%     172.532us        19.46%     288.666us      12.028us     232.863us        40.73%     232.863us       9.703us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.863us        40.73%     232.863us       9.703us            24  
+                                            aten::clone         1.45%      21.521us        37.67%     558.878us      93.146us       0.000us         0.00%     205.949us      34.325us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.205us        31.87%     182.205us      30.367us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.391us        15.98%      91.391us       7.616us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.277us        11.42%      65.277us       5.440us            12  
+                                              aten::sub         2.70%      40.111us         4.36%      64.701us      10.784us      32.768us         5.73%      32.768us       5.461us             6  
+                                              aten::add         2.31%      34.320us         3.88%      57.510us       9.585us      32.509us         5.69%      32.509us       5.418us             6  
+                                Activity Buffer Request        17.48%     259.324us        17.48%     259.324us     259.324us      23.744us         4.15%      23.744us      23.744us             1  
+                                    aten::empty_strided         2.00%      29.720us         2.00%      29.720us       4.953us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        14.68%     217.742us        14.68%     217.742us      36.290us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.50%      66.694us         5.68%      84.252us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.18%      17.558us         1.18%      17.558us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.16%     224.895us        15.16%     224.895us       4.685us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.36%       5.340us         0.36%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.469ms
-Self CUDA time total: 563.549us
+Self CPU time total: 1.484ms
+Self CUDA time total: 571.736us
 
 
 
@@ -4594,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     948.157us      1025.28%     948.157us     948.157us             1  
-                                            torch_eager        11.31%     303.890us        99.80%       2.681ms       2.681ms       0.000us         0.00%      93.597us      93.597us             1  
-                                              aten::mul         5.70%     153.152us         9.94%     267.009us      11.125us      49.696us        53.74%      49.696us       2.071us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.696us        53.74%      49.696us       2.071us            24  
-                                            aten::copy_         3.75%     100.883us        69.10%       1.857ms     103.143us      29.375us        31.76%      30.494us       1.694us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        24.43%      22.592us       1.883us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.407us        14.50%      13.407us       1.117us            12  
-                                            aten::clone         0.85%      22.792us        66.32%       1.782ms     296.986us       0.000us         0.00%       7.902us       1.317us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.783us         7.33%       6.783us       1.131us             6  
-                                              aten::sub         1.31%      35.191us         2.17%      58.341us       9.724us       6.720us         7.27%       6.720us       1.120us             6  
-                                              aten::add         1.15%      30.820us         1.98%      53.181us       8.863us       6.687us         7.23%       6.687us       1.114us             6  
-                                Activity Buffer Request        53.95%       1.449ms        53.95%       1.449ms       1.449ms       1.119us         1.21%       1.119us       1.119us             1  
-                                    aten::empty_strided         1.15%      30.830us         1.15%      30.830us       5.138us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         9.13%     245.326us         9.13%     245.326us      40.888us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.61%      70.171us         3.31%      88.830us       3.701us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.69%      18.659us         0.69%      18.659us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.20%     220.298us         8.20%     220.298us       4.590us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.250us         0.20%       5.250us       5.250us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.155us      1011.59%     936.155us     936.155us             1  
+                                            torch_eager         9.66%     281.404us        99.82%       2.908ms       2.908ms       0.000us         0.00%      93.663us      93.663us             1  
+                                              aten::mul         5.48%     159.764us         9.36%     272.564us      11.357us      49.568us        53.56%      49.568us       2.065us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.568us        53.56%      49.568us       2.065us            24  
+                                            aten::copy_         3.70%     107.711us        72.25%       2.105ms     116.944us      29.407us        31.78%      30.527us       1.696us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.591us        24.41%      22.591us       1.883us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.568us        14.66%      13.568us       1.131us            12  
+                                            aten::clone         0.74%      21.551us        69.34%       2.020ms     336.695us       0.000us         0.00%       7.936us       1.323us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us         7.37%       6.816us       1.136us             6  
+                                              aten::sub         1.31%      38.128us         2.13%      61.912us      10.319us       6.815us         7.36%       6.815us       1.136us             6  
+                                              aten::add         1.08%      31.450us         1.84%      53.600us       8.933us       6.753us         7.30%       6.753us       1.126us             6  
+                                Activity Buffer Request        59.75%       1.741ms        59.75%       1.741ms       1.741ms       1.120us         1.21%       1.120us       1.120us             1  
+                                    aten::empty_strided         1.04%      30.170us         1.04%      30.170us       5.028us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.73%     196.044us         6.73%     196.044us      32.674us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.24%      65.300us         2.82%      82.022us       3.418us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.57%      16.722us         0.57%      16.722us       0.697us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.53%     219.305us         7.53%     219.305us       4.569us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.160us         0.18%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.687ms
-Self CUDA time total: 92.478us
+Self CPU time total: 2.913ms
+Self CUDA time total: 92.543us
 
 
 
@@ -4624,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     924.823us       959.84%     924.823us     924.823us             1  
-                                            torch_eager        19.47%     279.525us        99.65%       1.430ms       1.430ms       0.000us         0.00%      97.664us      97.664us             1  
-                                              aten::mul        10.27%     147.364us        19.04%     273.370us      11.390us      51.165us        53.10%      51.165us       2.132us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.165us        53.10%      51.165us       2.132us            24  
-                                            aten::copy_         7.14%     102.519us        43.74%     627.869us      34.882us      30.913us        32.08%      32.225us       1.790us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        23.91%      23.040us       1.920us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.274us        14.81%      14.274us       1.189us            12  
-                                            aten::clone         1.45%      20.838us        38.33%     550.144us      91.691us       0.000us         0.00%       9.185us       1.531us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.873us         8.17%       7.873us       1.312us             6  
-                                              aten::add         2.18%      31.279us         3.75%      53.900us       8.983us       7.137us         7.41%       7.137us       1.189us             6  
-                                              aten::sub         2.45%      35.101us         4.11%      58.931us       9.822us       7.137us         7.41%       7.137us       1.189us             6  
-                                Activity Buffer Request        15.34%     220.215us        15.34%     220.215us     220.215us       1.312us         1.36%       1.312us       1.312us             1  
-                                    aten::empty_strided         2.15%      30.891us         2.15%      30.891us       5.148us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.11%     245.545us        17.11%     245.545us      40.924us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.62%      66.322us         5.93%      85.082us       3.545us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.760us         1.31%      18.760us       0.782us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        16.17%     232.047us        16.17%     232.047us       4.834us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       5.041us         0.35%       5.041us       5.041us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.262us       956.86%     918.262us     918.262us             1  
+                                            torch_eager        20.02%     274.163us        99.62%       1.364ms       1.364ms       0.000us         0.00%      97.279us      97.279us             1  
+                                              aten::mul        11.52%     157.766us        19.39%     265.646us      11.069us      51.167us        53.32%      51.167us       2.132us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.167us        53.32%      51.167us       2.132us            24  
+                                            aten::copy_         7.76%     106.268us        42.02%     575.576us      31.976us      30.720us        32.01%      32.033us       1.780us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        23.88%      22.912us       1.909us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.079us        14.67%      14.079us       1.173us            12  
+                                            aten::clone         1.48%      20.322us        36.02%     493.298us      82.216us       0.000us         0.00%       9.121us       1.520us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         8.14%       7.808us       1.301us             6  
+                                              aten::sub         2.81%      38.541us         4.49%      61.481us      10.247us       7.072us         7.37%       7.072us       1.179us             6  
+                                              aten::add         2.42%      33.131us         4.04%      55.302us       9.217us       7.007us         7.30%       7.007us       1.168us             6  
+                                Activity Buffer Request        16.17%     221.544us        16.17%     221.544us     221.544us       1.313us         1.37%       1.313us       1.313us             1  
+                                    aten::empty_strided         2.33%      31.950us         2.33%      31.950us       5.325us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.69%     187.513us        13.69%     187.513us      31.252us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.61%      63.101us         5.84%      79.961us       3.332us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.23%      16.860us         1.23%      16.860us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.57%     213.242us        15.57%     213.242us       4.443us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.270us         0.38%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.435ms
-Self CUDA time total: 96.352us
+Self CPU time total: 1.370ms
+Self CUDA time total: 95.966us
 
 
 
@@ -4654,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     915.886us       880.13%     915.886us     915.886us             1  
-                                            torch_eager        19.45%     278.057us        99.65%       1.425ms       1.425ms       0.000us         0.00%     105.374us     105.374us             1  
-                                              aten::mul        10.44%     149.250us        18.09%     258.645us      10.777us      55.325us        53.17%      55.325us       2.305us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.325us        53.17%      55.325us       2.305us            24  
-                                            aten::copy_         7.22%     103.283us        44.53%     636.707us      35.373us      32.575us        31.30%      33.887us       1.883us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.74%      24.703us       2.059us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.162us        15.53%      16.162us       1.347us            12  
-                                            aten::clone         1.49%      21.291us        38.97%     557.204us      92.867us       0.000us         0.00%       9.184us       1.531us             6  
-                                              aten::sub         2.42%      34.610us         4.09%      58.491us       9.749us       8.096us         7.78%       8.096us       1.349us             6  
-                                              aten::add         2.18%      31.210us         3.76%      53.710us       8.952us       8.066us         7.75%       8.066us       1.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.872us         7.56%       7.872us       1.312us             6  
-                                Activity Buffer Request        15.88%     227.005us        15.88%     227.005us     227.005us       1.312us         1.26%       1.312us       1.312us             1  
-                                    aten::empty_strided         2.12%      30.341us         2.12%      30.341us       5.057us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.11%     244.667us        17.11%     244.667us      40.778us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.81%      68.755us         6.12%      87.484us       3.645us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.729us         1.31%      18.729us       0.780us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.21%     217.528us        15.21%     217.528us       4.532us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       5.011us         0.35%       5.011us       5.011us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     929.528us       892.96%     929.528us     929.528us             1  
+                                            torch_eager        20.25%     278.528us        99.63%       1.370ms       1.370ms       0.000us         0.00%     105.439us     105.439us             1  
+                                              aten::mul        11.59%     159.422us        19.60%     269.583us      11.233us      55.326us        53.15%      55.326us       2.305us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.326us        53.15%      55.326us       2.305us            24  
+                                            aten::copy_         7.64%     105.130us        41.59%     572.021us      31.779us      32.351us        31.08%      33.695us       1.872us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.607us        23.64%      24.607us       2.051us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.418us        15.77%      16.418us       1.368us            12  
+                                            aten::clone         1.49%      20.431us        35.49%     488.057us      81.343us       0.000us         0.00%       9.088us       1.515us             6  
+                                              aten::sub         2.60%      35.723us         4.36%      59.953us       9.992us       8.258us         7.93%       8.258us       1.376us             6  
+                                              aten::add         2.46%      33.770us         4.07%      55.940us       9.323us       8.160us         7.84%       8.160us       1.360us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.44%       7.744us       1.291us             6  
+                                Activity Buffer Request        16.10%     221.454us        16.10%     221.454us     221.454us       1.344us         1.29%       1.344us       1.344us             1  
+                                    aten::empty_strided         2.25%      30.990us         2.25%      30.990us       5.165us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.30%     182.863us        13.30%     182.863us      30.477us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.81%      66.212us         6.02%      82.825us       3.451us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.21%      16.613us         1.21%      16.613us       0.692us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.93%     219.135us        15.93%     219.135us       4.565us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.37%       5.090us         0.37%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.430ms
-Self CUDA time total: 104.062us
+Self CPU time total: 1.375ms
+Self CUDA time total: 104.095us
 
 
 
@@ -4684,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     926.227us       747.17%     926.227us     926.227us             1  
-                                            torch_eager        10.87%     288.725us        99.79%       2.651ms       2.651ms       0.000us         0.00%     125.755us     125.755us             1  
-                                              aten::mul         5.66%     150.315us         9.84%     261.507us      10.896us      65.119us        52.53%      65.119us       2.713us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.119us        52.53%      65.119us       2.713us            24  
-                                            aten::copy_         3.77%     100.152us        69.45%       1.845ms     102.495us      39.455us        31.83%      41.246us       2.291us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.34%      28.928us       2.411us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.390us        15.64%      19.390us       1.616us            12  
-                                            aten::clone         0.89%      23.522us        66.73%       1.773ms     295.426us       0.000us         0.00%      12.318us       2.053us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.527us         8.49%      10.527us       1.755us             6  
-                                              aten::add         1.16%      30.840us         2.00%      53.221us       8.870us       9.759us         7.87%       9.759us       1.626us             6  
-                                              aten::sub         1.31%      34.853us         2.22%      58.863us       9.811us       9.631us         7.77%       9.631us       1.605us             6  
-                                Activity Buffer Request        54.50%       1.448ms        54.50%       1.448ms       1.448ms       1.791us         1.44%       1.791us       1.791us             1  
-                                    aten::empty_strided         1.16%      30.740us         1.16%      30.740us       5.123us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.93%     237.245us         8.93%     237.245us      39.541us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.65%      70.502us         3.36%      89.223us       3.718us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.70%      18.721us         0.70%      18.721us       0.780us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.19%     217.516us         8.19%     217.516us       4.532us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.590us         0.21%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.134us       762.57%     943.134us     943.134us             1  
+                                            torch_eager         9.91%     288.756us        99.81%       2.907ms       2.907ms       0.000us         0.00%     125.503us     125.503us             1  
+                                              aten::mul         5.47%     159.428us         9.14%     266.247us      11.094us      65.088us        52.63%      65.088us       2.712us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.088us        52.63%      65.088us       2.712us            24  
+                                            aten::copy_         3.82%     111.411us        72.08%       2.100ms     116.650us      39.391us        31.85%      41.215us       2.290us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.799us        23.29%      28.799us       2.400us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.200us        15.52%      19.200us       1.600us            12  
+                                            aten::clone         0.71%      20.821us        69.14%       2.014ms     335.649us       0.000us         0.00%      12.416us       2.069us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us         8.56%      10.592us       1.765us             6  
+                                              aten::sub         1.35%      39.440us         2.20%      63.980us      10.663us       9.632us         7.79%       9.632us       1.605us             6  
+                                              aten::add         1.16%      33.802us         1.92%      55.961us       9.327us       9.568us         7.74%       9.568us       1.595us             6  
+                                Activity Buffer Request        59.81%       1.742ms        59.81%       1.742ms       1.742ms       1.824us         1.47%       1.824us       1.824us             1  
+                                    aten::empty_strided         1.06%      30.871us         1.06%      30.871us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.32%     184.202us         6.32%     184.202us      30.700us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.20%      64.120us         2.78%      80.888us       3.370us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      16.768us         0.58%      16.768us       0.699us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.39%     215.298us         7.39%     215.298us       4.485us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.660us         0.19%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.656ms
-Self CUDA time total: 123.964us
+Self CPU time total: 2.913ms
+Self CUDA time total: 123.679us
 
 
 
@@ -4714,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     925.174us       889.06%     925.174us     925.174us             1  
-                                            torch_eager        20.56%     293.458us        99.64%       1.423ms       1.423ms       0.000us         0.00%     105.438us     105.438us             1  
-                                              aten::mul        10.42%     148.708us        18.32%     261.500us      10.896us      55.264us        53.11%      55.264us       2.303us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.264us        53.11%      55.264us       2.303us            24  
-                                            aten::copy_         7.08%     101.081us        43.33%     618.656us      34.370us      32.670us        31.39%      34.046us       1.891us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        23.74%      24.703us       2.059us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.128us        15.50%      16.128us       1.344us            12  
-                                            aten::clone         1.49%      21.220us        38.03%     542.913us      90.485us       0.000us         0.00%       9.343us       1.557us             6  
-                                              aten::sub         2.38%      33.992us         4.03%      57.481us       9.580us       8.064us         7.75%       8.064us       1.344us             6  
-                                              aten::add         2.21%      31.510us         3.80%      54.250us       9.042us       8.064us         7.75%       8.064us       1.344us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.967us         7.66%       7.967us       1.328us             6  
-                                Activity Buffer Request        14.99%     214.036us        14.99%     214.036us     214.036us       1.376us         1.32%       1.376us       1.376us             1  
-                                    aten::empty_strided         2.13%      30.461us         2.13%      30.461us       5.077us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.05%     243.458us        17.05%     243.458us      40.576us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.68%      66.831us         5.99%      85.500us       3.562us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.31%      18.669us         1.31%      18.669us       0.778us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.35%     219.102us        15.35%     219.102us       4.565us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.36%       5.101us         0.36%       5.101us       5.101us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     926.451us       888.37%     926.451us     926.451us             1  
+                                            torch_eager        20.56%     277.090us        99.61%       1.342ms       1.342ms       0.000us         0.00%     105.599us     105.599us             1  
+                                              aten::mul        11.75%     158.363us        19.88%     267.883us      11.162us      55.423us        53.14%      55.423us       2.309us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.423us        53.14%      55.423us       2.309us            24  
+                                            aten::copy_         7.94%     107.035us        40.62%     547.383us      30.410us      32.352us        31.02%      33.664us       1.870us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.63%      24.640us       2.053us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        15.83%      16.512us       1.376us            12  
+                                            aten::clone         1.47%      19.840us        34.29%     462.099us      77.016us       0.000us         0.00%       9.024us       1.504us             6  
+                                              aten::sub         2.93%      39.461us         4.68%      63.054us      10.509us       8.287us         7.95%       8.287us       1.381us             6  
+                                              aten::add         2.50%      33.680us         4.16%      56.100us       9.350us       8.225us         7.89%       8.225us       1.371us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.39%       7.712us       1.285us             6  
+                                Activity Buffer Request        14.74%     198.654us        14.74%     198.654us     198.654us       1.312us         1.26%       1.312us       1.312us             1  
+                                    aten::empty_strided         2.26%      30.481us         2.26%      30.481us       5.080us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.39%     180.523us        13.39%     180.523us      30.087us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.73%      63.708us         5.98%      80.630us       3.360us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      16.922us         1.26%      16.922us       0.705us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.08%     216.704us        16.08%     216.704us       4.515us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.39%       5.231us         0.39%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.428ms
-Self CUDA time total: 104.062us
+Self CPU time total: 1.348ms
+Self CUDA time total: 104.287us
 
 
 
@@ -4744,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     901.909us       727.35%     901.909us     901.909us             1  
-                                            torch_eager        19.87%     274.810us        99.60%       1.377ms       1.377ms       0.000us         0.00%     125.791us     125.791us             1  
-                                              aten::mul        10.85%     149.967us        18.79%     259.807us      10.825us      65.086us        52.49%      65.086us       2.712us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.086us        52.49%      65.086us       2.712us            24  
-                                            aten::copy_         7.46%     103.216us        42.83%     592.168us      32.898us      39.518us        31.87%      41.310us       2.295us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.862us        23.28%      28.862us       2.405us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.395us        15.64%      19.395us       1.616us            12  
-                                            aten::clone         1.61%      22.200us        37.56%     519.385us      86.564us       0.000us         0.00%      12.448us       2.075us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.656us         8.59%      10.656us       1.776us             6  
-                                              aten::add         2.23%      30.899us         3.81%      52.660us       8.777us       9.730us         7.85%       9.730us       1.622us             6  
-                                              aten::sub         2.44%      33.801us         4.13%      57.151us       9.525us       9.665us         7.79%       9.665us       1.611us             6  
-                                Activity Buffer Request        13.62%     188.345us        13.62%     188.345us     188.345us       1.792us         1.45%       1.792us       1.792us             1  
-                                    aten::empty_strided         2.34%      32.371us         2.34%      32.371us       5.395us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        17.39%     240.467us        17.39%     240.467us      40.078us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.87%      67.397us         6.22%      86.038us       3.585us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.35%      18.641us         1.35%      18.641us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.56%     215.091us        15.56%     215.091us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.40%       5.540us         0.40%       5.540us       5.540us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.662us       754.64%     931.662us     931.662us             1  
+                                            torch_eager        20.88%     278.302us        99.60%       1.328ms       1.328ms       0.000us         0.00%     125.281us     125.281us             1  
+                                              aten::mul        11.71%     156.112us        20.55%     273.936us      11.414us      65.153us        52.77%      65.153us       2.715us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.153us        52.77%      65.153us       2.715us            24  
+                                            aten::copy_         7.95%     105.951us        39.52%     526.779us      29.265us      39.169us        31.73%      40.993us       2.277us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.737us        23.28%      28.737us       2.395us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.135us        15.50%      19.135us       1.595us            12  
+                                            aten::clone         1.44%      19.200us        33.27%     443.406us      73.901us       0.000us         0.00%      12.256us       2.043us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.45%      10.432us       1.739us             6  
+                                              aten::sub         2.81%      37.440us         4.58%      61.110us      10.185us       9.632us         7.80%       9.632us       1.605us             6  
+                                              aten::add         2.52%      33.611us         4.17%      55.611us       9.268us       9.503us         7.70%       9.503us       1.584us             6  
+                                Activity Buffer Request        13.21%     176.083us        13.21%     176.083us     176.083us       1.824us         1.48%       1.824us       1.824us             1  
+                                    aten::empty_strided         2.29%      30.570us         2.29%      30.570us       5.095us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.82%     184.192us        13.82%     184.192us      30.699us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.90%      65.274us         6.16%      82.123us       3.422us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      16.849us         1.26%      16.849us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.81%     224.047us        16.81%     224.047us       4.668us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.310us         0.40%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.383ms
-Self CUDA time total: 123.999us
+Self CPU time total: 1.333ms
+Self CUDA time total: 123.457us
 
 
 
@@ -4774,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.661us       533.26%     944.661us     944.661us             1  
-                                            torch_eager        10.70%     284.298us        99.79%       2.652ms       2.652ms       0.000us         0.00%     180.029us     180.029us             1  
-                                              aten::mul         6.06%     161.074us        10.27%     272.980us      11.374us      94.781us        53.50%      94.781us       3.949us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.781us        53.50%      94.781us       3.949us            24  
-                                            aten::copy_         3.97%     105.392us        69.06%       1.835ms     101.961us      57.664us        32.55%      60.545us       3.364us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.607us        22.92%      40.607us       3.384us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        13.94%      24.703us       2.059us            12  
-                                            aten::clone         0.89%      23.759us        66.19%       1.759ms     293.179us       0.000us         0.00%      19.938us       3.323us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.057us         9.63%      17.057us       2.843us             6  
-                                              aten::sub         1.37%      36.511us         2.33%      61.971us      10.329us      12.383us         6.99%      12.383us       2.064us             6  
-                                              aten::add         1.17%      31.070us         2.01%      53.400us       8.900us      12.320us         6.95%      12.320us       2.053us             6  
-                                Activity Buffer Request        53.91%       1.433ms        53.91%       1.433ms       1.433ms       2.881us         1.63%       2.881us       2.881us             1  
-                                    aten::empty_strided         1.17%      31.132us         1.17%      31.132us       5.189us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.85%     235.245us         8.85%     235.245us      39.208us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.64%      70.123us         3.36%      89.202us       3.717us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.72%      19.079us         0.72%      19.079us       0.795us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.35%     221.788us         8.35%     221.788us       4.621us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.21%       5.460us         0.21%       5.460us       5.460us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.092us       532.26%     944.092us     944.092us             1  
+                                            torch_eager         9.66%     282.874us        99.81%       2.921ms       2.921ms       0.000us         0.00%     180.253us     180.253us             1  
+                                              aten::mul         5.51%     161.402us         9.28%     271.603us      11.317us      95.040us        53.58%      95.040us       3.960us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.040us        53.58%      95.040us       3.960us            24  
+                                            aten::copy_         3.62%     106.065us        72.07%       2.109ms     117.193us      57.663us        32.51%      60.543us       3.364us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.703us        22.95%      40.703us       3.392us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.670us        13.91%      24.670us       2.056us            12  
+                                            aten::clone         0.77%      22.428us        69.22%       2.026ms     337.680us       0.000us         0.00%      19.840us       3.307us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.56%      16.960us       2.827us             6  
+                                              aten::add         1.16%      34.010us         1.95%      57.150us       9.525us      12.383us         6.98%      12.383us       2.064us             6  
+                                              aten::sub         1.32%      38.563us         2.15%      62.972us      10.495us      12.287us         6.93%      12.287us       2.048us             6  
+                                Activity Buffer Request        59.97%       1.755ms        59.97%       1.755ms       1.755ms       2.880us         1.62%       2.880us       2.880us             1  
+                                    aten::empty_strided         1.05%      30.691us         1.05%      30.691us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.31%     184.633us         6.31%     184.633us      30.772us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.32%      67.977us         2.88%      84.170us       3.507us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.55%      16.193us         0.55%      16.193us       0.675us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.56%     221.262us         7.56%     221.262us       4.610us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.19%       5.669us         0.19%       5.669us       5.669us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.657ms
-Self CUDA time total: 177.148us
+Self CPU time total: 2.927ms
+Self CUDA time total: 177.373us
 
 
 
@@ -4804,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.138us       321.69%     954.138us     954.138us             1  
-                                            torch_eager        11.45%     309.471us        99.80%       2.697ms       2.697ms       0.000us         0.00%     313.854us     313.854us             1  
-                                              aten::mul         5.62%     151.933us         9.84%     265.955us      11.081us     144.896us        48.85%     144.896us       6.037us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     144.896us        48.85%     144.896us       6.037us            24  
-                                            aten::copy_         3.99%     107.722us        68.69%       1.856ms     103.120us     111.039us        37.44%     128.287us       7.127us            18  
-                                            aten::clone         1.05%      28.369us        65.82%       1.779ms     296.444us       0.000us         0.00%      70.944us      11.824us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.33%      57.343us       4.779us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.696us        18.10%      53.696us       8.949us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.671us        13.71%      40.671us       3.389us            12  
-                                              aten::sub         1.32%      35.620us         2.23%      60.211us      10.035us      20.448us         6.89%      20.448us       3.408us             6  
-                                              aten::add         1.16%      31.420us         1.99%      53.831us       8.972us      20.223us         6.82%      20.223us       3.371us             6  
-                                Activity Buffer Request        53.66%       1.450ms        53.66%       1.450ms       1.450ms      17.248us         5.82%      17.248us      17.248us             1  
-                                    aten::empty_strided         1.25%      33.832us         1.25%      33.832us       5.639us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync         8.57%     231.556us         8.57%     231.556us      38.593us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         2.58%      69.773us         3.29%      88.953us       3.706us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.71%      19.180us         0.71%      19.180us       0.799us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel         8.44%     228.015us         8.44%     228.015us       4.750us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.20%       5.370us         0.20%       5.370us       5.370us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     956.029us       320.35%     956.029us     956.029us             1  
+                                            torch_eager        10.28%     306.488us        99.82%       2.977ms       2.977ms       0.000us         0.00%     316.194us     316.194us             1  
+                                              aten::mul         5.10%     152.001us         8.95%     266.845us      11.119us     146.560us        49.11%     146.560us       6.107us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.560us        49.11%     146.560us       6.107us            24  
+                                            aten::copy_         3.72%     110.901us        71.64%       2.137ms     118.718us     110.754us        37.11%     128.514us       7.140us            18  
+                                            aten::clone         0.97%      28.901us        68.99%       2.058ms     342.957us       0.000us         0.00%      70.944us      11.824us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.570us        19.29%      57.570us       4.797us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.184us        17.82%      53.184us       8.864us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.120us        13.78%      41.120us       3.427us            12  
+                                              aten::add         1.16%      34.740us         1.93%      57.500us       9.583us      20.641us         6.92%      20.641us       3.440us             6  
+                                              aten::sub         1.34%      39.998us         2.18%      65.101us      10.850us      20.479us         6.86%      20.479us       3.413us             6  
+                                Activity Buffer Request        59.58%       1.777ms        59.58%       1.777ms       1.777ms      17.760us         5.95%      17.760us      17.760us             1  
+                                    aten::empty_strided         1.05%      31.260us         1.05%      31.260us       5.210us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         6.26%     186.663us         6.26%     186.663us      31.111us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.24%      66.809us         2.82%      84.238us       3.510us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.58%      17.429us         0.58%      17.429us       0.726us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         7.54%     224.919us         7.54%     224.919us       4.686us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.18%       5.469us         0.18%       5.469us       5.469us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.702ms
-Self CUDA time total: 296.606us
+Self CPU time total: 2.983ms
+Self CUDA time total: 298.434us
 
 
 
@@ -4834,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     930.130us       525.53%     930.130us     930.130us             1  
-                                            torch_eager        19.64%     282.826us        99.65%       1.435ms       1.435ms       0.000us         0.00%     179.836us     179.836us             1  
-                                              aten::mul        10.48%     150.844us        18.43%     265.387us      11.058us      94.845us        53.59%      94.845us       3.952us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.845us        53.59%      94.845us       3.952us            24  
-                                            aten::copy_         8.38%     120.684us        44.09%     634.887us      35.272us      57.502us        32.49%      60.350us       3.353us            18  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.478us        22.87%      40.478us       3.373us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        13.92%      24.641us       2.053us            12  
-                                            aten::clone         1.49%      21.461us        38.48%     554.053us      92.342us       0.000us         0.00%      19.872us       3.312us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.62%      17.024us       2.837us             6  
-                                              aten::sub         2.41%      34.731us         4.09%      58.881us       9.813us      12.353us         6.98%      12.353us       2.059us             6  
-                                              aten::add         2.13%      30.662us         3.72%      53.511us       8.919us      12.288us         6.94%      12.288us       2.048us             6  
-                                Activity Buffer Request        15.30%     220.275us        15.30%     220.275us     220.275us       2.848us         1.61%       2.848us       2.848us             1  
-                                    aten::empty_strided         2.11%      30.450us         2.11%      30.450us       5.075us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        15.99%     230.296us        15.99%     230.296us      38.383us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.74%      68.240us         6.08%      87.483us       3.645us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.34%      19.243us         1.34%      19.243us       0.802us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.64%     225.174us        15.64%     225.174us       4.691us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       5.110us         0.35%       5.110us       5.110us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.392us       515.61%     916.392us     916.392us             1  
+                                            torch_eager        19.58%     274.201us        99.60%       1.394ms       1.394ms       0.000us         0.00%     180.610us     180.610us             1  
+                                              aten::mul        11.24%     157.371us        18.87%     264.183us      11.008us      95.074us        53.49%      95.074us       3.961us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.074us        53.49%      95.074us       3.961us            24  
+                                            aten::copy_         7.77%     108.775us        43.49%     608.863us      33.826us      57.825us        32.54%      60.705us       3.373us            18  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.897us        23.01%      40.897us       3.408us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.831us        13.97%      24.831us       2.069us            12  
+                                            aten::clone         1.40%      19.580us        37.38%     523.368us      87.228us       0.000us         0.00%      19.808us       3.301us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.52%      16.928us       2.821us             6  
+                                              aten::add         2.38%      33.360us         4.00%      56.040us       9.340us      12.416us         6.99%      12.416us       2.069us             6  
+                                              aten::sub         2.76%      38.582us         4.39%      61.472us      10.245us      12.415us         6.99%      12.415us       2.069us             6  
+                                Activity Buffer Request        18.14%     253.955us        18.14%     253.955us     253.955us       2.880us         1.62%       2.880us       2.880us             1  
+                                    aten::empty_strided         2.13%      29.860us         2.13%      29.860us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.38%     187.273us        13.38%     187.273us      31.212us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.53%      63.391us         5.73%      80.293us       3.346us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.21%      16.902us         1.21%      16.902us       0.704us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.09%     211.242us        15.09%     211.242us       4.401us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.40%       5.600us         0.40%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.440ms
-Self CUDA time total: 176.988us
+Self CPU time total: 1.400ms
+Self CUDA time total: 177.730us
 
 
 
@@ -4864,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.347us       313.60%     931.347us     931.347us             1  
-                                            torch_eager        20.13%     283.358us        99.65%       1.403ms       1.403ms       0.000us         0.00%     314.679us     314.679us             1  
-                                              aten::mul        10.72%     150.883us        18.79%     264.457us      11.019us     145.371us        48.95%     145.371us       6.057us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.371us        48.95%     145.371us       6.057us            24  
-                                            aten::copy_         7.40%     104.164us        42.97%     604.868us      33.604us     110.845us        37.32%     128.541us       7.141us            18  
-                                            aten::clone         1.53%      21.600us        37.15%     522.944us      87.157us       0.000us         0.00%      71.357us      11.893us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.184us        19.25%      57.184us       4.765us            12  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.661us        18.07%      53.661us       8.944us             6  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.767us        13.73%      40.767us       3.397us            12  
-                                              aten::add         2.28%      32.151us         3.88%      54.682us       9.114us      20.446us         6.88%      20.446us       3.408us             6  
-                                              aten::sub         2.39%      33.622us         4.06%      57.171us       9.528us      20.321us         6.84%      20.321us       3.387us             6  
-                                Activity Buffer Request        14.77%     207.975us        14.77%     207.975us     207.975us      17.696us         5.96%      17.696us      17.696us             1  
-                                    aten::empty_strided         2.15%      30.270us         2.15%      30.270us       5.045us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.22%     228.377us        16.22%     228.377us      38.063us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.75%      66.830us         6.13%      86.290us       3.595us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.38%      19.460us         1.38%      19.460us       0.811us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.91%     224.006us        15.91%     224.006us       4.667us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.35%       4.971us         0.35%       4.971us       4.971us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.618us       312.71%     934.618us     934.618us             1  
+                                            torch_eager        20.60%     280.895us        99.62%       1.358ms       1.358ms       0.000us         0.00%     316.921us     316.921us             1  
+                                              aten::mul        11.57%     157.759us        19.61%     267.373us      11.141us     146.460us        49.00%     146.460us       6.102us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.460us        49.00%     146.460us       6.102us            24  
+                                            aten::copy_         8.07%     110.072us        41.19%     561.700us      31.206us     111.966us        37.46%     130.013us       7.223us            18  
+                                            aten::clone         1.51%      20.600us        34.77%     474.096us      79.016us       0.000us         0.00%      72.670us      12.112us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.19%      57.343us       4.779us            12  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.623us        18.28%      54.623us       9.104us             6  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.448us        13.53%      40.448us       3.371us            12  
+                                              aten::add         2.59%      35.260us         4.22%      57.590us       9.598us      20.288us         6.79%      20.288us       3.381us             6  
+                                              aten::sub         2.60%      35.410us         4.30%      58.621us       9.770us      20.160us         6.75%      20.160us       3.360us             6  
+                                Activity Buffer Request        14.73%     200.853us        14.73%     200.853us     200.853us      18.047us         6.04%      18.047us      18.047us             1  
+                                    aten::empty_strided         2.18%      29.660us         2.18%      29.660us       4.943us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.85%     188.823us        13.85%     188.823us      31.471us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.75%      64.754us         6.01%      81.922us       3.413us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.26%      17.168us         1.26%      17.168us       0.715us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        15.92%     217.107us        15.92%     217.107us       4.523us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.38%       5.180us         0.38%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.408ms
-Self CUDA time total: 296.983us
+Self CPU time total: 1.364ms
+Self CUDA time total: 298.874us
 
 
 
@@ -4894,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.511us       159.85%     931.511us     931.511us             1  
-                                            torch_eager        19.89%     283.237us        99.62%       1.419ms       1.419ms       0.000us         0.00%     606.457us     606.457us             1  
-                                            aten::copy_         7.21%     102.593us        43.52%     619.697us      34.428us     267.708us        45.94%     291.419us      16.190us            18  
-                                              aten::mul        10.56%     150.425us        18.55%     264.165us      11.007us     249.406us        42.80%     249.406us      10.392us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.406us        42.80%     249.406us      10.392us            24  
-                                            aten::clone         1.52%      21.631us        38.04%     541.603us      90.267us       0.000us         0.00%     201.277us      33.546us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     177.566us        30.47%     177.566us      29.594us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.142us        15.47%      90.142us       7.512us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.632us        11.26%      65.632us       5.469us            12  
-                                              aten::add         2.16%      30.762us         3.77%      53.662us       8.944us      32.832us         5.63%      32.832us       5.472us             6  
-                                              aten::sub         2.53%      36.013us         4.23%      60.192us      10.032us      32.800us         5.63%      32.800us       5.467us             6  
-                                Activity Buffer Request        14.90%     212.145us        14.90%     212.145us     212.145us      23.711us         4.07%      23.711us      23.711us             1  
-                                    aten::empty_strided         2.14%      30.440us         2.14%      30.440us       5.073us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        16.99%     241.846us        16.99%     241.846us      40.308us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         4.71%      67.093us         6.00%      85.482us       3.562us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         1.29%      18.389us         1.29%      18.389us       0.766us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        15.73%     223.932us        15.73%     223.932us       4.665us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize         0.38%       5.360us         0.38%       5.360us       5.360us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     956.919us       161.50%     956.919us     956.919us             1  
+                                            torch_eager        21.30%     289.504us        99.57%       1.353ms       1.353ms       0.000us         0.00%     616.281us     616.281us             1  
+                                            aten::copy_         7.84%     106.532us        38.89%     528.548us      29.364us     278.013us        46.92%     301.788us      16.766us            18  
+                                              aten::mul        11.95%     162.407us        20.79%     282.469us      11.770us     248.703us        41.97%     248.703us      10.363us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     248.703us        41.97%     248.703us      10.363us            24  
+                                            aten::clone         1.53%      20.799us        32.73%     444.735us      74.123us       0.000us         0.00%     210.204us      35.034us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     186.429us        31.46%     186.429us      31.072us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.584us        15.46%      91.584us       7.632us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.790us        11.10%      65.790us       5.483us            12  
+                                              aten::add         2.44%      33.161us         4.08%      55.501us       9.250us      32.927us         5.56%      32.927us       5.488us             6  
+                                              aten::sub         2.95%      40.030us         4.74%      64.440us      10.740us      32.863us         5.55%      32.863us       5.477us             6  
+                                Activity Buffer Request        13.07%     177.663us        13.07%     177.663us     177.663us      23.775us         4.01%      23.775us      23.775us             1  
+                                    aten::empty_strided         2.15%      29.270us         2.15%      29.270us       4.878us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync        13.63%     185.172us        13.63%     185.172us      30.862us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         4.83%      65.662us         6.08%      82.660us       3.444us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         1.25%      16.998us         1.25%      16.998us       0.708us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel        16.63%     225.993us        16.63%     225.993us       4.708us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize         0.43%       5.780us         0.43%       5.780us       5.780us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.424ms
-Self CUDA time total: 582.746us
+Self CPU time total: 1.359ms
+Self CUDA time total: 592.506us
 
 
 
@@ -4924,59 +4706,105 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager        13.84%     306.170us        64.60%       1.429ms       1.429ms       0.000us         0.00%       1.835ms       1.835ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.808ms       102.17%       1.808ms       1.808ms             1  
-                                            aten::copy_         5.17%     114.346us        26.90%     594.995us      33.055us     791.984us        44.77%     858.095us      47.672us            18  
-                                              aten::mul         6.78%     150.032us        12.17%     269.044us      11.210us     828.790us        46.85%     828.790us      34.533us            24  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     828.790us        46.85%     828.790us      34.533us            24  
-                                            aten::clone         1.04%      23.090us        22.74%     502.934us      83.822us       0.000us         0.00%     626.230us     104.372us             6  
-                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     560.119us        31.66%     560.119us      93.353us             6  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     231.865us        13.11%     231.865us      19.322us            12  
-void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     148.413us         8.39%     148.413us      12.368us            12  
-                                              aten::sub         1.69%      37.309us         2.75%      60.900us      10.150us      90.142us         5.10%      90.142us      15.024us             6  
-                                Activity Buffer Request         8.38%     185.324us         8.38%     185.324us     185.324us      66.111us         3.74%      66.111us      66.111us             1  
-                                              aten::add         1.41%      31.181us         2.49%      55.022us       9.170us      58.271us         3.29%      58.271us       9.712us             6  
-                                    aten::empty_strided         1.45%      31.982us         1.45%      31.982us       5.330us       0.000us         0.00%       0.000us       0.000us             6  
-                                        cudaMemcpyAsync        10.29%     227.584us        10.29%     227.584us      37.931us       0.000us         0.00%       0.000us       0.000us             6  
-                                            aten::slice         3.11%      68.695us         3.96%      87.553us       3.648us       0.000us         0.00%       0.000us       0.000us            24  
-                                       aten::as_strided         0.85%      18.858us         0.85%      18.858us       0.786us       0.000us         0.00%       0.000us       0.000us            24  
-                                       cudaLaunchKernel        10.59%     234.185us        10.59%     234.185us       4.879us       0.000us         0.00%       0.000us       0.000us            48  
-                                  cudaDeviceSynchronize        35.40%     782.770us        35.40%     782.770us     782.770us       0.000us         0.00%       0.000us       0.000us             1  
+                                            torch_eager        12.69%     276.287us        61.52%       1.340ms       1.340ms       0.000us         0.00%       1.863ms       1.863ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.835ms       102.22%       1.835ms       1.835ms             1  
+                                            aten::copy_         5.01%     109.060us        24.98%     544.137us      30.230us     806.007us        44.89%     873.590us      48.533us            18  
+                                              aten::mul         7.11%     154.844us        12.06%     262.604us      10.942us     842.615us        46.93%     842.615us      35.109us            24  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     842.615us        46.93%     842.615us      35.109us            24  
+                                            aten::clone         1.01%      22.000us        21.12%     459.916us      76.653us       0.000us         0.00%     622.361us     103.727us             6  
+                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     554.778us        30.90%     554.778us      92.463us             6  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.229us        13.99%     251.229us      20.936us            12  
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     146.939us         8.18%     146.939us      12.245us            12  
+                                              aten::sub         1.90%      41.421us         3.00%      65.411us      10.902us      88.573us         4.93%      88.573us      14.762us             6  
+                                Activity Buffer Request         8.49%     184.983us         8.49%     184.983us     184.983us      67.583us         3.76%      67.583us      67.583us             1  
+                                              aten::add         1.54%      33.561us         2.59%      56.461us       9.410us      58.366us         3.25%      58.366us       9.728us             6  
+                                    aten::empty_strided         1.42%      30.960us         1.42%      30.960us       5.160us       0.000us         0.00%       0.000us       0.000us             6  
+                                        cudaMemcpyAsync         8.70%     189.543us         8.70%     189.543us      31.591us       0.000us         0.00%       0.000us       0.000us             6  
+                                            aten::slice         2.99%      65.113us         3.77%      82.061us       3.419us       0.000us         0.00%       0.000us       0.000us            24  
+                                       aten::as_strided         0.78%      16.948us         0.78%      16.948us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
+                                       cudaLaunchKernel         9.88%     215.201us         9.88%     215.201us       4.483us       0.000us         0.00%       0.000us       0.000us            48  
+                                  cudaDeviceSynchronize        38.48%     838.063us        38.48%     838.063us     838.063us       0.000us         0.00%       0.000us       0.000us             1  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.212ms
-Self CUDA time total: 1.769ms
+Self CPU time total: 2.178ms
+Self CUDA time total: 1.796ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
-torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
-torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
-torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
-torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
+torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
+torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
+torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
+torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
+torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
-torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
-torch_eager              cuda_B1_S512_H8_D64_R32     0.21  True
-torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
+torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
+torch_eager              cuda_B1_S512_H8_D128_R64     0.23  True
+torch_eager              cuda_B1_S512_H8_D64_R32     0.23  True
+torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
 torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
+torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
-torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
+torch_eager              cuda_B2_S2048_H32_D128_R64     0.65  True
 torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
 torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
 torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
 torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
-torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
-torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
-torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
+torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
+torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
+torch_eager              cuda_B2_S512_H8_D64_R32     0.23  True
 
▶ UV Install Logs