diff --git "a/rotary/impls/torch_rotary.html" "b/rotary/impls/torch_rotary.html" --- "a/rotary/impls/torch_rotary.html" +++ "b/rotary/impls/torch_rotary.html" @@ -57,8 +57,12 @@ // For HTML files, navigate within iframe if (fullPath.endsWith('.html') || fullPath.endsWith('/')) { const pathParts = fullPath.split('/').filter(p => p); - const targetFile = pathParts[pathParts.length - 1] || 'index.html'; - window.location.href = targetFile; + // remove "#/" prefix if present + let iframePath = fullPath; + if (iframePath.startsWith('#/')) { + iframePath = iframePath.slice(2); + } + window.location.href = "/" + iframePath; } else { // For non-HTML files (raw .py, etc), open directly window.open(href, '_blank'); @@ -99,16 +103,16 @@ --bg-error: #fdf2f2; --bg-artifact: #e6f3ff; --bg-artifact-hover: #d0e7ff; - + --text-primary: #333; --text-secondary: #656d76; --text-error: #c53030; --text-link: #0969da; - + --border-primary: #e1e5e9; --border-error: #e53e3e; --border-cell-failed: #d73a49; - + --shadow: rgba(0, 0, 0, 0.1); } @@ -120,32 +124,26 @@ --bg-error: #1a0f0f; --bg-artifact: #151515; --bg-artifact-hover: #1a1a1a; - + --text-primary: #e0e0e0; --text-secondary: #888888; --text-error: #ff6b6b; --text-link: #64b5f6; - + --border-primary: #2a2a2a; --border-error: #ff6b6b; --border-cell-failed: #ff6b6b; - + --shadow: rgba(255, 255, 255, 0.05); } - /* Monocolor UI theme: black/white background, all text/borders single blue */ -:root[data-ui="monocolor"] { - --mono-color: #0a66ff; -} - +:root[data-ui="monocolor"] { --mono-color: #0a66ff; } :root[data-ui="monocolor"][data-theme="light"] { --bg-primary: #ffffff; } - :root[data-ui="monocolor"][data-theme="dark"] { --bg-primary: #000000; } - :root[data-ui="monocolor"] { --bg-secondary: var(--bg-primary); --bg-tertiary: var(--bg-primary); @@ -165,76 +163,25 @@ --shadow: none; } - -:root[data-ui="monocolor"] a { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] a { color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button, :root[data-ui="monocolor"] .theme-toggle, :root[data-ui="monocolor"] .reset-toggle, -:root[data-ui="monocolor"] .back-button { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .back-button { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } :root[data-ui="monocolor"] .menu-button:hover, :root[data-ui="monocolor"] .theme-toggle:hover, :root[data-ui="monocolor"] .reset-toggle:hover, -:root[data-ui="monocolor"] .back-button:hover { - background: var(--bg-primary); - color: var(--mono-color); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .menu-dropdown { - background: var(--bg-primary); - border-color: var(--mono-color); - box-shadow: none; -} - -:root[data-ui="monocolor"] .menu-item { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .system-info { - background: var(--bg-primary); - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .cell { - border-color: var(--mono-color); - background: var(--bg-primary); -} - -:root[data-ui="monocolor"] .cell-header { - background: var(--bg-primary); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .artifact:hover { - background: var(--bg-primary); -} - +:root[data-ui="monocolor"] .back-button:hover { background: var(--bg-primary); color: var(--mono-color); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .menu-dropdown { background: var(--bg-primary); border-color: var(--mono-color); box-shadow: none; } +:root[data-ui="monocolor"] .menu-item { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .system-info { background: var(--bg-primary); border-color: var(--mono-color); } +:root[data-ui="monocolor"] .cell { border-color: var(--mono-color); background: var(--bg-primary); } +:root[data-ui="monocolor"] .cell-header { background: var(--bg-primary); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .artifact:hover { background: var(--bg-primary); } :root[data-ui="monocolor"] .artifact-preview img, -:root[data-ui="monocolor"] .artifact-preview svg { - border-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .status-widget { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .artifact-preview svg { border-color: var(--mono-color); } +:root[data-ui="monocolor"] .status-widget { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } :root[data-ui="monocolor"] .minimap, :root[data-ui="monocolor"] .file-explorer, :root[data-ui="monocolor"] .tools-widget { @@ -242,54 +189,23 @@ border-color: var(--mono-color); color: var(--mono-color); } - :root[data-ui="monocolor"] .cell-code { background: var(--bg-primary); border-bottom-color: var(--mono-color); } - :root[data-ui="monocolor"] .tools-title, :root[data-ui="monocolor"] .file-explorer-section-title, -:root[data-ui="monocolor"] .minimap-title { - color: var(--mono-color); - border-bottom-color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button { - background: var(--bg-primary); - border-color: var(--mono-color); - color: var(--mono-color); -} - -:root[data-ui="monocolor"] .tool-button.active { - border-color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-title { color: var(--mono-color); border-bottom-color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button { background: var(--bg-primary); border-color: var(--mono-color); color: var(--mono-color); } +:root[data-ui="monocolor"] .tool-button.active { border-color: var(--mono-color); } :root[data-ui="monocolor"] .file-explorer-item, -:root[data-ui="monocolor"] .minimap-item { - color: var(--mono-color); -} - +:root[data-ui="monocolor"] .minimap-item { color: var(--mono-color); } /* Force Pygments code to mono blue on mono bg */ -:root[data-ui="monocolor"] .highlight { - background: var(--bg-primary) !important; - color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight { background: var(--bg-primary) !important; color: var(--mono-color) !important; } :root[data-ui="monocolor"] .highlight *, -:root[data-ui="monocolor"] .highlight .hll { - color: var(--mono-color) !important; - background: transparent !important; - border-color: var(--mono-color) !important; -} - +:root[data-ui="monocolor"] .highlight .hll { color: var(--mono-color) !important; background: transparent !important; border-color: var(--mono-color) !important; } /* Default code font + metrics (overridable via frontmatter) */ -:root { - --code-font-size: 0.95rem; - --code-line-height: 1.5; - --code-pad-y: 0.75rem; -} - +:root { --code-font-size: 0.95rem; --code-line-height: 1.5; --code-pad-y: 0.75rem; } /* Minimal UI theme overrides base variables for a flatter, 90s look */ :root[data-ui="none"] { --bg-primary: #ffffff; @@ -311,11 +227,9 @@ --shadow: none; } - html { overscroll-behavior: none; } - body { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; line-height: 1.4; @@ -327,7 +241,6 @@ body { transition: background-color 0.2s ease, color 0.2s ease; overscroll-behavior: none; } - /* Minimal "none" UI theme overrides */ :root[data-ui="none"] body { font-family: 'Times New Roman', Times, serif; @@ -351,11 +264,7 @@ body { gap: 0.25rem; z-index: 1000; } - -.controls-buttons { - display: flex; - gap: 0.5rem; -} +.controls-buttons { display: flex; gap: 0.5rem; } .menu-button { position: relative; @@ -369,7 +278,6 @@ body { font-size: 0.9rem; user-select: none; } - /* Keep default control styling when widgets are enabled, even in minimal UI */ :root[data-ui="none"][data-widgets="on"] .menu-button, :root[data-ui="none"][data-widgets="on"] .theme-toggle, @@ -384,7 +292,6 @@ body { color: var(--text-primary); background: var(--bg-tertiary); } - /* Controls state indicator (top-right) */ /* Status widget (bottom-right) */ .status-widget { @@ -401,17 +308,8 @@ body { color: var(--text-secondary); z-index: 100; } - -.status-widget strong { - color: var(--text-primary); -} - -:root[data-ui="none"][data-widgets="on"] .status-widget { - background: #f6f6f6; - border-color: #ccc; - color: #222; -} - +.status-widget strong { color: var(--text-primary); } +:root[data-ui="none"][data-widgets="on"] .status-widget { background: #f6f6f6; border-color: #ccc; color: #222; } :root[data-ui="none"][data-widgets="on"] .menu-button:hover, :root[data-ui="none"][data-widgets="on"] .theme-toggle:hover, :root[data-ui="none"][data-widgets="on"] .reset-toggle:hover, @@ -437,12 +335,7 @@ body { z-index: 1001; margin-top: 4px; } - -:root[data-ui="none"][data-widgets="on"] .menu-dropdown { - background: #ffffff; - border: 1px solid #cccccc; - box-shadow: none; -} +:root[data-ui="none"][data-widgets="on"] .menu-dropdown { background: #ffffff; border: 1px solid #cccccc; box-shadow: none; } .menu-button.active .menu-dropdown { opacity: 1; @@ -459,11 +352,7 @@ body { border-bottom: 1px solid var(--border-primary); cursor: pointer; } - -:root[data-ui="none"] .menu-item { - color: #000; - border-bottom: 1px solid #eee; -} +:root[data-ui="none"] .menu-item { color: #000; border-bottom: 1px solid #eee; } .menu-item:last-child { border-bottom: none; @@ -527,8 +416,7 @@ body { font-family: monospace; } -.theme-toggle, -.reset-toggle { +.theme-toggle, .reset-toggle { background: var(--bg-secondary); border: 1px solid var(--border-primary); border-radius: 2px; @@ -543,8 +431,7 @@ body { letter-spacing: 0; } -.theme-toggle:hover, -.reset-toggle:hover { +.theme-toggle:hover, .reset-toggle:hover { background: var(--bg-tertiary); border-color: var(--text-secondary); color: var(--text-primary); @@ -566,20 +453,16 @@ body { opacity: 0.9; transition: opacity 0.2s ease; } - /* Hide widgets and controls when disabled via frontmatter */ :root[data-widgets="off"] .controls, :root[data-widgets="off"] .minimap, :root[data-widgets="off"] .file-explorer, :root[data-widgets="off"] .tools-widget, -:root[data-widgets="off"] .status-widget { - display: none !important; -} +:root[data-widgets="off"] .status-widget { display: none !important; } .file-explorer { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -603,18 +486,15 @@ body { left: 0; width: 100vw; height: 100vh; - z-index: 80; - /* under widgets (100) and controls (1000) */ + z-index: 80; /* under widgets (100) and controls (1000) */ display: block; - pointer-events: none; - /* enabled only when a tool is active */ + pointer-events: none; /* enabled only when a tool is active */ } /* Tools widget */ .tools-widget { position: fixed; - bottom: 20px; - /* default; JS will stack */ + bottom: 20px; /* default; JS will stack */ right: 20px; left: auto; top: auto; @@ -627,7 +507,6 @@ body { z-index: 100; opacity: 0.95; } - .tools-title { font-weight: bold; color: var(--text-secondary); @@ -637,13 +516,7 @@ body { cursor: grab; user-select: none; } - -.tools-row { - display: flex; - gap: 0.4rem; - flex-wrap: wrap; -} - +.tools-row { display: flex; gap: 0.4rem; flex-wrap: wrap; } .tool-button { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -655,19 +528,10 @@ body { font-size: 0.75rem; user-select: none; } +.tool-button:hover { color: var(--text-primary); } +.tool-button.active { color: var(--text-primary); border-color: var(--text-secondary); background: var(--bg-secondary); } -.tool-button:hover { - color: var(--text-primary); -} - -.tool-button.active { - color: var(--text-primary); - border-color: var(--text-secondary); - background: var(--bg-secondary); -} - -.minimap:hover, -.file-explorer:hover { +.minimap:hover, .file-explorer:hover { opacity: 1; } @@ -677,8 +541,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -707,29 +570,12 @@ body { font-weight: normal; } -.minimap-heading.h1 { - padding-left: 0.5rem; -} - -.minimap-heading.h2 { - padding-left: 1rem; -} - -.minimap-heading.h3 { - padding-left: 1.5rem; -} - -.minimap-heading.h4 { - padding-left: 2rem; -} - -.minimap-heading.h5 { - padding-left: 2.5rem; -} - -.minimap-heading.h6 { - padding-left: 3rem; -} +.minimap-heading.h1 { padding-left: 0.5rem; } +.minimap-heading.h2 { padding-left: 1rem; } +.minimap-heading.h3 { padding-left: 1.5rem; } +.minimap-heading.h4 { padding-left: 2rem; } +.minimap-heading.h5 { padding-left: 2.5rem; } +.minimap-heading.h6 { padding-left: 3rem; } .minimap-cell { color: var(--text-link); @@ -747,8 +593,7 @@ body { margin-bottom: 0.5rem; padding-bottom: 0.25rem; border-bottom: 1px solid var(--border-primary); - cursor: grab; - /* drag handle */ + cursor: grab; /* drag handle */ user-select: none; } @@ -792,10 +637,7 @@ body { /* Hide widgets on smaller screens */ @media (max-width: 768px) { - - .minimap, - .file-explorer, - .tools-widget { + .minimap, .file-explorer, .tools-widget { display: none; } } @@ -807,13 +649,7 @@ body { overflow: hidden; background: var(--bg-secondary); } - -:root[data-ui="none"] .cell { - margin: 1em 0; - border: none; - background: transparent; -} - +:root[data-ui="none"] .cell { margin: 1em 0; border: none; background: transparent; } .cell-header { background: var(--bg-secondary); padding: 0.5rem 1rem; @@ -821,72 +657,39 @@ body { font-family: inherit; font-size: 0.85rem; } - -:root[data-ui="none"] .cell-header { - background: transparent; - border: none; - padding: 0; - font-weight: bold; -} - -:root[data-ui="none"] .cell-content { - padding: 0; -} - +:root[data-ui="none"] .cell-header { background: transparent; border: none; padding: 0; font-weight: bold; } +:root[data-ui="none"] .cell-content { padding: 0; } :root[data-ui="none"] .copy-button, :root[data-ui="none"] .collapse-indicators, :root[data-ui="none"] .cell-meta, -:root[data-ui="none"] .cell-outputs-header { - display: none !important; -} - -:root[data-ui="none"] pre, -:root[data-ui="none"] code { - font-family: Menlo, Monaco, 'Courier New', monospace; -} - -:root[data-ui="none"] .code-content pre { - background: #f9f9f9; - border: 1px solid #ddd; - padding: 8px; -} - -:root[data-ui="none"] .output { - background: transparent; - border: none; - padding: 0.25em 0; -} - -color: var(--text-secondary); -cursor: pointer; -user-select: none; -transition: background-color 0.2s ease; +:root[data-ui="none"] .cell-outputs-header { display: none !important; } +:root[data-ui="none"] pre, :root[data-ui="none"] code { font-family: Menlo, Monaco, 'Courier New', monospace; } +:root[data-ui="none"] .code-content pre { background: #f9f9f9; border: 1px solid #ddd; padding: 8px; } +:root[data-ui="none"] .output { background: transparent; border: none; padding: 0.25em 0; } + color: var(--text-secondary); + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease; } - .cell-header:hover { background: var(--bg-tertiary); } - .collapse-indicators { color: var(--text-secondary); font-size: 0.8rem; opacity: 0.7; } - .collapse-indicators span:hover { color: var(--text-primary); opacity: 1; } - .cell-code { display: block; background: var(--bg-code); } - .cell-code.collapsed { display: none; } - .cell-code pre { margin: 0; padding: 0.75rem; @@ -894,17 +697,14 @@ transition: background-color 0.2s ease; overflow-x: auto; color: var(--text-primary); } - .cell-output { padding: 0.75rem; /* background: var(--bg-primary); */ background: var(--bg-secondary); } - .cell-output.collapsed { display: none; } - .cell-stdout { background: var(--bg-tertiary); padding: 0.75rem; @@ -925,21 +725,15 @@ transition: background-color 0.2s ease; color: var(--text-primary); /* key bits */ - overflow: auto; - /* show scrollbars when needed */ - max-width: 100%; - /* respects whatever layout width you give it */ + overflow: auto; /* show scrollbars when needed */ + max-width: 100%; /* respects whatever layout width you give it */ } .cell-stdout .stdout-text { - margin: 0; - /* reset pre default margin */ - white-space: pre; - /* keep line breaks, NO wrapping */ - display: inline-block; - /* shrink-to-content */ - min-width: max-content; - /* allow very long lines to define intrinsic width */ + margin: 0; /* reset pre default margin */ + white-space: pre; /* keep line breaks, NO wrapping */ + display: inline-block; /* shrink-to-content */ + min-width: max-content; /* allow very long lines to define intrinsic width */ font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; tab-size: 2; } @@ -954,11 +748,9 @@ transition: background-color 0.2s ease; color: var(--text-error); white-space: pre-wrap; } - .uv-install-logs { margin: 0.5rem 0; } - .uv-logs-header { cursor: pointer; padding: 0.75rem; @@ -968,7 +760,6 @@ transition: background-color 0.2s ease; color: var(--text-secondary); user-select: none; } - .uv-logs-content { background: var(--bg-secondary); padding: 1rem; @@ -979,17 +770,14 @@ transition: background-color 0.2s ease; color: var(--text-secondary); overflow-x: auto; } - .cell-artifacts { margin: 1rem 0; } - .cell-artifacts h4 { margin: 0 0 0.5rem 0; color: var(--text-secondary); font-size: 0.9rem; } - .artifact { display: inline-block; background: var(--bg-artifact); @@ -1003,22 +791,18 @@ transition: background-color 0.2s ease; transition: background-color 0.2s ease; border: 1px solid var(--border-primary); } - .artifact:hover { background: var(--bg-artifact-hover); } - .artifact-preview { margin-top: 1rem; } - .artifact-preview img { max-width: 100%; height: auto; border: 1px solid var(--border-primary); border-radius: 1px; } - .artifact-preview svg { max-width: 100%; height: auto; @@ -1026,33 +810,27 @@ transition: background-color 0.2s ease; border-radius: 1px; display: block; } - /* Style SVG text elements */ .artifact-preview svg g { fill: var(--text-primary) !important; } - /* Auto-theme SVG elements */ .artifact-preview svg { background: transparent; } - /* Invert SVG images in dark mode */ :root[data-theme="dark"] .artifact-preview img[src$=".svg"] { filter: invert(0.9) hue-rotate(180deg); } - /* Keep SVG images readable in monocolor mode */ :root[data-ui="monocolor"] .artifact-preview img[src$=".svg"] { filter: none; } - /* CSV table styling */ .artifact-csv { margin-top: 1rem; overflow-x: auto; } - .csv-table { width: 100%; border-collapse: collapse; @@ -1061,24 +839,20 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-primary); border-radius: 1px; } - .csv-table th, .csv-table td { padding: 0.5rem 0.75rem; text-align: left; border: 1px solid var(--border-primary); } - .csv-table th { background: var(--bg-tertiary); font-weight: 600; color: var(--text-primary); } - .csv-table tbody tr:hover { background: var(--bg-artifact-hover); } - .artifact-csv-error { margin-top: 1rem; padding: 1rem; @@ -1087,27 +861,22 @@ transition: background-color 0.2s ease; border: 1px solid var(--border-error); border-radius: 1px; } - .cell-failed { border-color: var(--border-cell-failed); } - .cell-failed .cell-header { background: var(--bg-error); color: var(--text-error); } - .cell-commented { opacity: 0.6; border-style: dashed; } - .cell-commented .cell-header { background: var(--bg-secondary); color: var(--text-secondary); font-style: italic; } - .run-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1119,17 +888,14 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .run-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .run-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1141,24 +907,20 @@ transition: background-color 0.2s ease; font-family: inherit; margin-left: 4px; } - .copy-btn:hover { color: var(--text-primary); background: var(--bg-primary); } - .copy-btn:disabled { opacity: 0.6; cursor: not-allowed; } - .copy-btn.copied { color: #4caf50; background: var(--bg-primary); border-color: #4caf50; transition: all 0.2s ease; } - .raw-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1172,13 +934,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .raw-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .github-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1192,13 +952,11 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .github-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .hf-btn { background: var(--bg-tertiary); border: 1px solid var(--border-primary); @@ -1212,18 +970,15 @@ transition: background-color 0.2s ease; text-decoration: none; display: inline-block; } - .hf-btn:hover { color: var(--text-primary); background: var(--bg-primary); text-decoration: none; } - .output-stale { opacity: 0.5; position: relative; } - .output-stale::after { content: '⏳ updating...'; position: absolute; @@ -1236,77 +991,41 @@ transition: background-color 0.2s ease; color: var(--text-secondary); border: 1px solid var(--border-primary); } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { margin-top: 1.5rem; margin-bottom: 0.75rem; color: var(--text-primary); } - h1 { margin-top: 0; margin-bottom: 1rem; } - p { margin: 0.75rem 0; color: var(--text-primary); } - a { color: var(--text-link); } - img { max-width: 100%; height: auto; border-radius: 1px; box-shadow: none; } - -pre, -code { +pre, code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace; font-size: var(--code-font-size); } - -.code-wrap { - position: relative; -} - -.code-line-highlight { - display: none; - position: absolute; - left: 0; - right: 0; - height: 1.5em; - background: rgba(255, 235, 170, 0.35); - pointer-events: none; - border-left: 3px solid #f4c542; -} - -.line-number { - cursor: pointer; - text-decoration: none; - color: var(--text-secondary); - padding: 0 0.25rem; -} - -.line-number.selected { - background: rgba(255, 235, 170, 0.4); - color: var(--text-primary); -} +.code-wrap { position: relative; } +.code-line-highlight { display: none; position: absolute; left: 0; right: 0; height: 1.5em; background: rgba(255, 235, 170, 0.35); pointer-events: none; border-left: 3px solid #f4c542; } +.line-number { cursor: pointer; text-decoration: none; color: var(--text-secondary); padding: 0 0.25rem; } +.line-number.selected { background: rgba(255, 235, 170, 0.4); color: var(--text-primary); } /* Line numbers */ .highlight-with-lines { display: flex; } - .line-numbers { background: var(--bg-tertiary); padding: var(--code-pad-y) 0.5rem; @@ -1318,21 +1037,14 @@ code { text-align: right; border-right: 1px solid var(--border-primary); } - .line-numbers .line-number { display: block; line-height: var(--code-line-height); } - .highlight-with-lines .highlight { flex: 1; } - -.highlight .hll { - background-color: transparent; -} - -/* don't conflict with our highlight */ +.highlight .hll { background-color: transparent; } /* don't conflict with our highlight */ .highlight pre { white-space: pre; margin: 0; @@ -1344,37 +1056,177 @@ code { .cell-code.collapsed { display: none; } - .cell-code.expanded { display: block; } - { - % if config.collapse_code % -} - -.cell-code { - display: none; -} - - { - % else % -} - .cell-code { display: block; border-bottom: 1px solid var(--border-primary); } - { - % endif % -} - { - { - pygments_css - } -} +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="light"] .highlight .hll { background-color: #ffffcc } +[data-theme="light"] .highlight { background: #f8f8f8; } +[data-theme="light"] .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ +[data-theme="light"] .highlight .err { border: 1px solid #F00 } /* Error */ +[data-theme="light"] .highlight .k { color: #008000; font-weight: bold } /* Keyword */ +[data-theme="light"] .highlight .o { color: #666 } /* Operator */ +[data-theme="light"] .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ +[data-theme="light"] .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ +[data-theme="light"] .highlight .cp { color: #9C6500 } /* Comment.Preproc */ +[data-theme="light"] .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */ +[data-theme="light"] .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */ +[data-theme="light"] .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */ +[data-theme="light"] .highlight .gd { color: #A00000 } /* Generic.Deleted */ +[data-theme="light"] .highlight .ge { font-style: italic } /* Generic.Emph */ +[data-theme="light"] .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="light"] .highlight .gr { color: #E40000 } /* Generic.Error */ +[data-theme="light"] .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +[data-theme="light"] .highlight .gi { color: #008400 } /* Generic.Inserted */ +[data-theme="light"] .highlight .go { color: #717171 } /* Generic.Output */ +[data-theme="light"] .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ +[data-theme="light"] .highlight .gs { font-weight: bold } /* Generic.Strong */ +[data-theme="light"] .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +[data-theme="light"] .highlight .gt { color: #04D } /* Generic.Traceback */ +[data-theme="light"] .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ +[data-theme="light"] .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ +[data-theme="light"] .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ +[data-theme="light"] .highlight .kp { color: #008000 } /* Keyword.Pseudo */ +[data-theme="light"] .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */ +[data-theme="light"] .highlight .kt { color: #B00040 } /* Keyword.Type */ +[data-theme="light"] .highlight .m { color: #666 } /* Literal.Number */ +[data-theme="light"] .highlight .s { color: #BA2121 } /* Literal.String */ +[data-theme="light"] .highlight .na { color: #687822 } /* Name.Attribute */ +[data-theme="light"] .highlight .nb { color: #008000 } /* Name.Builtin */ +[data-theme="light"] .highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +[data-theme="light"] .highlight .no { color: #800 } /* Name.Constant */ +[data-theme="light"] .highlight .nd { color: #A2F } /* Name.Decorator */ +[data-theme="light"] .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ +[data-theme="light"] .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ +[data-theme="light"] .highlight .nf { color: #00F } /* Name.Function */ +[data-theme="light"] .highlight .nl { color: #767600 } /* Name.Label */ +[data-theme="light"] .highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ +[data-theme="light"] .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ +[data-theme="light"] .highlight .nv { color: #19177C } /* Name.Variable */ +[data-theme="light"] .highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +[data-theme="light"] .highlight .w { color: #BBB } /* Text.Whitespace */ +[data-theme="light"] .highlight .mb { color: #666 } /* Literal.Number.Bin */ +[data-theme="light"] .highlight .mf { color: #666 } /* Literal.Number.Float */ +[data-theme="light"] .highlight .mh { color: #666 } /* Literal.Number.Hex */ +[data-theme="light"] .highlight .mi { color: #666 } /* Literal.Number.Integer */ +[data-theme="light"] .highlight .mo { color: #666 } /* Literal.Number.Oct */ +[data-theme="light"] .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ +[data-theme="light"] .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ +[data-theme="light"] .highlight .sc { color: #BA2121 } /* Literal.String.Char */ +[data-theme="light"] .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */ +[data-theme="light"] .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ +[data-theme="light"] .highlight .s2 { color: #BA2121 } /* Literal.String.Double */ +[data-theme="light"] .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */ +[data-theme="light"] .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */ +[data-theme="light"] .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */ +[data-theme="light"] .highlight .sx { color: #008000 } /* Literal.String.Other */ +[data-theme="light"] .highlight .sr { color: #A45A77 } /* Literal.String.Regex */ +[data-theme="light"] .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ +[data-theme="light"] .highlight .ss { color: #19177C } /* Literal.String.Symbol */ +[data-theme="light"] .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ +[data-theme="light"] .highlight .fm { color: #00F } /* Name.Function.Magic */ +[data-theme="light"] .highlight .vc { color: #19177C } /* Name.Variable.Class */ +[data-theme="light"] .highlight .vg { color: #19177C } /* Name.Variable.Global */ +[data-theme="light"] .highlight .vi { color: #19177C } /* Name.Variable.Instance */ +[data-theme="light"] .highlight .vm { color: #19177C } /* Name.Variable.Magic */ +[data-theme="light"] .highlight .il { color: #666 } /* Literal.Number.Integer.Long */ + +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +[data-theme="dark"] .highlight .hll { background-color: #49483e } +[data-theme="dark"] .highlight { background: #272822; color: #F8F8F2 } +[data-theme="dark"] .highlight .c { color: #959077 } /* Comment */ +[data-theme="dark"] .highlight .err { color: #ED007E; background-color: #1E0010 } /* Error */ +[data-theme="dark"] .highlight .esc { color: #F8F8F2 } /* Escape */ +[data-theme="dark"] .highlight .g { color: #F8F8F2 } /* Generic */ +[data-theme="dark"] .highlight .k { color: #66D9EF } /* Keyword */ +[data-theme="dark"] .highlight .l { color: #AE81FF } /* Literal */ +[data-theme="dark"] .highlight .n { color: #F8F8F2 } /* Name */ +[data-theme="dark"] .highlight .o { color: #FF4689 } /* Operator */ +[data-theme="dark"] .highlight .x { color: #F8F8F2 } /* Other */ +[data-theme="dark"] .highlight .p { color: #F8F8F2 } /* Punctuation */ +[data-theme="dark"] .highlight .ch { color: #959077 } /* Comment.Hashbang */ +[data-theme="dark"] .highlight .cm { color: #959077 } /* Comment.Multiline */ +[data-theme="dark"] .highlight .cp { color: #959077 } /* Comment.Preproc */ +[data-theme="dark"] .highlight .cpf { color: #959077 } /* Comment.PreprocFile */ +[data-theme="dark"] .highlight .c1 { color: #959077 } /* Comment.Single */ +[data-theme="dark"] .highlight .cs { color: #959077 } /* Comment.Special */ +[data-theme="dark"] .highlight .gd { color: #FF4689 } /* Generic.Deleted */ +[data-theme="dark"] .highlight .ge { color: #F8F8F2; font-style: italic } /* Generic.Emph */ +[data-theme="dark"] .highlight .ges { color: #F8F8F2; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +[data-theme="dark"] .highlight .gr { color: #F8F8F2 } /* Generic.Error */ +[data-theme="dark"] .highlight .gh { color: #F8F8F2 } /* Generic.Heading */ +[data-theme="dark"] .highlight .gi { color: #A6E22E } /* Generic.Inserted */ +[data-theme="dark"] .highlight .go { color: #66D9EF } /* Generic.Output */ +[data-theme="dark"] .highlight .gp { color: #FF4689; font-weight: bold } /* Generic.Prompt */ +[data-theme="dark"] .highlight .gs { color: #F8F8F2; font-weight: bold } /* Generic.Strong */ +[data-theme="dark"] .highlight .gu { color: #959077 } /* Generic.Subheading */ +[data-theme="dark"] .highlight .gt { color: #F8F8F2 } /* Generic.Traceback */ +[data-theme="dark"] .highlight .kc { color: #66D9EF } /* Keyword.Constant */ +[data-theme="dark"] .highlight .kd { color: #66D9EF } /* Keyword.Declaration */ +[data-theme="dark"] .highlight .kn { color: #FF4689 } /* Keyword.Namespace */ +[data-theme="dark"] .highlight .kp { color: #66D9EF } /* Keyword.Pseudo */ +[data-theme="dark"] .highlight .kr { color: #66D9EF } /* Keyword.Reserved */ +[data-theme="dark"] .highlight .kt { color: #66D9EF } /* Keyword.Type */ +[data-theme="dark"] .highlight .ld { color: #E6DB74 } /* Literal.Date */ +[data-theme="dark"] .highlight .m { color: #AE81FF } /* Literal.Number */ +[data-theme="dark"] .highlight .s { color: #E6DB74 } /* Literal.String */ +[data-theme="dark"] .highlight .na { color: #A6E22E } /* Name.Attribute */ +[data-theme="dark"] .highlight .nb { color: #F8F8F2 } /* Name.Builtin */ +[data-theme="dark"] .highlight .nc { color: #A6E22E } /* Name.Class */ +[data-theme="dark"] .highlight .no { color: #66D9EF } /* Name.Constant */ +[data-theme="dark"] .highlight .nd { color: #A6E22E } /* Name.Decorator */ +[data-theme="dark"] .highlight .ni { color: #F8F8F2 } /* Name.Entity */ +[data-theme="dark"] .highlight .ne { color: #A6E22E } /* Name.Exception */ +[data-theme="dark"] .highlight .nf { color: #A6E22E } /* Name.Function */ +[data-theme="dark"] .highlight .nl { color: #F8F8F2 } /* Name.Label */ +[data-theme="dark"] .highlight .nn { color: #F8F8F2 } /* Name.Namespace */ +[data-theme="dark"] .highlight .nx { color: #A6E22E } /* Name.Other */ +[data-theme="dark"] .highlight .py { color: #F8F8F2 } /* Name.Property */ +[data-theme="dark"] .highlight .nt { color: #FF4689 } /* Name.Tag */ +[data-theme="dark"] .highlight .nv { color: #F8F8F2 } /* Name.Variable */ +[data-theme="dark"] .highlight .ow { color: #FF4689 } /* Operator.Word */ +[data-theme="dark"] .highlight .pm { color: #F8F8F2 } /* Punctuation.Marker */ +[data-theme="dark"] .highlight .w { color: #F8F8F2 } /* Text.Whitespace */ +[data-theme="dark"] .highlight .mb { color: #AE81FF } /* Literal.Number.Bin */ +[data-theme="dark"] .highlight .mf { color: #AE81FF } /* Literal.Number.Float */ +[data-theme="dark"] .highlight .mh { color: #AE81FF } /* Literal.Number.Hex */ +[data-theme="dark"] .highlight .mi { color: #AE81FF } /* Literal.Number.Integer */ +[data-theme="dark"] .highlight .mo { color: #AE81FF } /* Literal.Number.Oct */ +[data-theme="dark"] .highlight .sa { color: #E6DB74 } /* Literal.String.Affix */ +[data-theme="dark"] .highlight .sb { color: #E6DB74 } /* Literal.String.Backtick */ +[data-theme="dark"] .highlight .sc { color: #E6DB74 } /* Literal.String.Char */ +[data-theme="dark"] .highlight .dl { color: #E6DB74 } /* Literal.String.Delimiter */ +[data-theme="dark"] .highlight .sd { color: #E6DB74 } /* Literal.String.Doc */ +[data-theme="dark"] .highlight .s2 { color: #E6DB74 } /* Literal.String.Double */ +[data-theme="dark"] .highlight .se { color: #AE81FF } /* Literal.String.Escape */ +[data-theme="dark"] .highlight .sh { color: #E6DB74 } /* Literal.String.Heredoc */ +[data-theme="dark"] .highlight .si { color: #E6DB74 } /* Literal.String.Interpol */ +[data-theme="dark"] .highlight .sx { color: #E6DB74 } /* Literal.String.Other */ +[data-theme="dark"] .highlight .sr { color: #E6DB74 } /* Literal.String.Regex */ +[data-theme="dark"] .highlight .s1 { color: #E6DB74 } /* Literal.String.Single */ +[data-theme="dark"] .highlight .ss { color: #E6DB74 } /* Literal.String.Symbol */ +[data-theme="dark"] .highlight .bp { color: #F8F8F2 } /* Name.Builtin.Pseudo */ +[data-theme="dark"] .highlight .fm { color: #A6E22E } /* Name.Function.Magic */ +[data-theme="dark"] .highlight .vc { color: #F8F8F2 } /* Name.Variable.Class */ +[data-theme="dark"] .highlight .vg { color: #F8F8F2 } /* Name.Variable.Global */ +[data-theme="dark"] .highlight .vi { color: #F8F8F2 } /* Name.Variable.Instance */ +[data-theme="dark"] .highlight .vm { color: #F8F8F2 } /* Name.Variable.Magic */ +[data-theme="dark"] .highlight .il { color: #AE81FF } /* Literal.Number.Integer.Long */ /* Ensure our code metrics override Pygments defaults */ .highlight pre { @@ -1386,76 +1238,23 @@ code { font-family: 'Cascadia Mono', 'Cascadia Code', 'JetBrains Mono', 'SF Mono', Monaco, 'Consolas', monospace !important; border: none; } - -.line-numbers { - line-height: var(--code-line-height) !important; -} - -.line-numbers .line-number { - line-height: var(--code-line-height) !important; -} +.line-numbers { line-height: var(--code-line-height) !important; } +.line-numbers .line-number { line-height: var(--code-line-height) !important; } /* Custom CSS from frontmatter */ - { - { - config.custom_css - } -} - - { - # Override code font size from frontmatter (accept number as px) # -} - - { - % if config.code_font_size is not none % -} - - { - % if config.code_font_size is string % -} - -:root { - --code-font-size: { - { - config.code_font_size - } - } - - ; -} - - { - % else % -} -:root { - --code-font-size: { - { - config.code_font_size - } - } - px; -} - { - % endif % -} - { - % endif % -} /* Cursor for tools */ -body[data-tool="arrow"] .main-content { +body[data-tool="arrow"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, crosshair; } - -body[data-tool="pen"] .main-content { +body[data-tool="pen"] .main-content { cursor: url('data:image/svg+xml;utf8,') 4 20, pointer; } - -body[data-tool="eraser"] .main-content { +body[data-tool="eraser"] .main-content { cursor: url('data:image/svg+xml;utf8,') 12 12, auto; } @@ -1468,14 +1267,12 @@ body[data-tool="eraser"] .main-content { text-transform: uppercase; letter-spacing: 0.5px; } - .color-row { display: grid; grid-template-columns: repeat(6, 1fr); gap: 0.25rem; margin-bottom: 0.5rem; } - .color-swatch { width: 18px; height: 18px; @@ -1485,17 +1282,14 @@ body[data-tool="eraser"] .main-content { transition: all 0.2s ease; position: relative; } - .color-swatch:hover { transform: scale(1.1); border-color: var(--text-secondary); } - .color-swatch.selected { border-color: var(--text-primary); box-shadow: 0 0 0 2px var(--text-link); } - .color-swatch.selected::after { content: '✓'; position: absolute; @@ -1507,7 +1301,6 @@ body[data-tool="eraser"] .main-content { font-weight: bold; text-shadow: 1px 1px 1px black; } - .color-input { width: 24px; height: 24px; @@ -1519,7 +1312,6 @@ body[data-tool="eraser"] .main-content { grid-column: span 2; justify-self: center; } - .color-input:hover { border-color: var(--text-secondary); } @@ -1531,7 +1323,6 @@ body[data-tool="eraser"] .main-content { gap: 0.5rem; margin-top: 0.75rem; } - .thickness-slider { flex: 1; -webkit-appearance: none; @@ -1543,11 +1334,9 @@ body[data-tool="eraser"] .main-content { opacity: 0.7; transition: opacity 0.2s; } - .thickness-slider:hover { opacity: 1; } - .thickness-slider::-webkit-slider-thumb { -webkit-appearance: none; appearance: none; @@ -1557,7 +1346,6 @@ body[data-tool="eraser"] .main-content { border-radius: 50%; cursor: pointer; } - .thickness-slider::-moz-range-thumb { width: 12px; height: 12px; @@ -1566,7 +1354,6 @@ body[data-tool="eraser"] .main-content { cursor: pointer; border: none; } - .thickness-value { font-size: 0.7rem; color: var(--text-secondary); @@ -1592,18 +1379,18 @@ body[data-tool="eraser"] .main-content { } @keyframes spin { - to { - transform: rotate(360deg); - } + to { transform: rotate(360deg); } } .loading-skeleton { display: inline-block; background: var(--bg-tertiary); - background: linear-gradient(90deg, - var(--bg-tertiary) 25%, - var(--bg-secondary) 50%, - var(--bg-tertiary) 75%); + background: linear-gradient( + 90deg, + var(--bg-tertiary) 25%, + var(--bg-secondary) 50%, + var(--bg-tertiary) 75% + ); background-size: 200% 100%; animation: loading-shimmer 2s ease-in-out infinite; border-radius: 2px; @@ -1613,13 +1400,8 @@ body[data-tool="eraser"] .main-content { } @keyframes loading-shimmer { - 0% { - background-position: -200% 0; - } - - 100% { - background-position: 200% 0; - } + 0% { background-position: -200% 0; } + 100% { background-position: 200% 0; } } /* Loading state for cell output */ @@ -4092,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
Fri Oct 31 20:00:00 2025 +Mon Nov 10 21:57:39 2025 +-----------------------------------------------------------------------------------------+ -| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | -|-----------------------------------------+------------------------+----------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 32C P0 101W / 350W | 0MiB / 46068MiB | 100% Default | +| N/A 26C P0 88W / 350W | 0MiB / 46068MiB | 22% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -4155,7 +3937,7 @@ Cell: nv | 0.23s ▼ output ▶ uv-logs | -Cell: benchmark | 7.58s +Cell: benchmark | 38.43s | Raw @@ -4234,27 +4016,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.068ms 1195.27% 1.068ms 1.068ms 1 - torch_eager 14.00% 388.140us 99.71% 2.764ms 2.764ms 0.000us 0.00% 90.528us 90.528us 1 - aten::mul 6.16% 170.676us 10.43% 289.217us 12.051us 46.911us 52.52% 46.911us 1.955us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.911us 52.52% 46.911us 1.955us 24 - aten::copy_ 4.25% 117.935us 62.65% 1.737ms 96.500us 29.185us 32.68% 30.401us 1.689us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.561us 25.26% 22.561us 1.880us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.216us 14.80% 13.216us 1.101us 12 - aten::clone 1.62% 44.961us 61.78% 1.713ms 285.451us 0.000us 0.00% 7.840us 1.307us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 7.42% 6.624us 1.104us 6 - aten::sub 1.59% 44.071us 2.54% 70.301us 11.717us 6.624us 7.42% 6.624us 1.104us 6 - aten::add 1.26% 34.801us 2.08% 57.721us 9.620us 6.592us 7.38% 6.592us 1.099us 6 - Activity Buffer Request 53.17% 1.474ms 53.17% 1.474ms 1.474ms 1.216us 1.36% 1.216us 1.216us 1 - aten::empty_strided 2.35% 65.251us 2.35% 65.251us 10.875us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.98% 82.752us 2.98% 82.752us 13.792us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.05% 84.591us 4.03% 111.694us 4.654us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.98% 27.103us 0.98% 27.103us 1.129us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.29% 229.882us 8.29% 229.882us 4.789us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.29% 8.120us 0.29% 8.120us 8.120us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.315ms 1474.39% 1.315ms 1.315ms 1 + torch_eager 7.00% 401.548us 82.40% 4.729ms 4.729ms 0.000us 0.00% 90.432us 90.432us 1 + aten::mul 3.25% 186.430us 5.35% 307.044us 12.793us 46.943us 52.62% 46.943us 1.956us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 46.943us 52.62% 46.943us 1.956us 24 + aten::copy_ 2.48% 142.261us 48.48% 2.782ms 154.576us 29.122us 32.64% 30.338us 1.685us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.433us 25.14% 22.433us 1.869us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.151us 14.74% 13.151us 1.096us 12 + aten::clone 0.88% 50.441us 59.65% 3.423ms 570.575us 0.000us 0.00% 7.905us 1.318us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.689us 7.50% 6.689us 1.115us 6 + aten::sub 0.82% 47.350us 1.28% 73.411us 12.235us 6.591us 7.39% 6.591us 1.098us 6 + aten::add 0.64% 36.811us 1.04% 59.601us 9.934us 6.560us 7.35% 6.560us 1.093us 6 + Activity Buffer Request 39.92% 2.291ms 39.92% 2.291ms 2.291ms 1.216us 1.36% 1.216us 1.216us 1 + aten::empty_strided 16.52% 948.386us 16.52% 948.386us 158.064us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 1.38% 78.980us 1.38% 78.980us 13.163us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 1.46% 83.925us 1.86% 106.703us 4.446us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.40% 22.778us 0.40% 22.778us 0.949us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.66% 439.430us 7.66% 439.430us 9.155us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 17.60% 1.010ms 17.60% 1.010ms 1.010ms 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.772ms -Self CUDA time total: 89.312us +Self CPU time total: 5.740ms +Self CUDA time total: 89.216us @@ -4264,27 +4046,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 960.345us 1063.10% 960.345us 960.345us 1 - torch_eager 11.94% 304.272us 99.78% 2.543ms 2.543ms 0.000us 0.00% 91.454us 91.454us 1 - aten::mul 6.19% 157.625us 10.77% 274.398us 11.433us 47.776us 52.89% 47.776us 1.991us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.776us 52.89% 47.776us 1.991us 24 - aten::copy_ 4.14% 105.392us 66.58% 1.697ms 94.258us 29.343us 32.48% 30.463us 1.692us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.559us 24.97% 22.559us 1.880us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.215us 14.63% 13.215us 1.101us 12 - aten::clone 0.97% 24.733us 63.76% 1.625ms 270.825us 0.000us 0.00% 7.904us 1.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 7.51% 6.784us 1.131us 6 - aten::add 1.23% 31.452us 2.12% 54.072us 9.012us 6.623us 7.33% 6.623us 1.104us 6 - aten::sub 1.53% 39.032us 2.55% 64.964us 10.827us 6.592us 7.30% 6.592us 1.099us 6 - Activity Buffer Request 57.59% 1.468ms 57.59% 1.468ms 1.468ms 1.120us 1.24% 1.120us 1.120us 1 - aten::empty_strided 1.31% 33.410us 1.31% 33.410us 5.568us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.43% 61.963us 2.43% 61.963us 10.327us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.76% 70.222us 3.54% 90.271us 3.761us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.79% 20.049us 0.79% 20.049us 0.835us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.91% 226.937us 8.91% 226.937us 4.728us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.22% 5.590us 0.22% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 967.576us 1072.55% 967.576us 967.576us 1 + torch_eager 10.80% 301.919us 99.80% 2.790ms 2.790ms 0.000us 0.00% 91.365us 91.365us 1 + aten::mul 5.82% 162.824us 9.87% 275.997us 11.500us 47.523us 52.68% 47.523us 1.980us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 47.523us 52.68% 47.523us 1.980us 24 + aten::copy_ 4.18% 116.751us 70.01% 1.957ms 108.723us 29.282us 32.46% 30.434us 1.691us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.497us 24.94% 22.497us 1.875us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.408us 14.86% 13.408us 1.117us 12 + aten::clone 0.79% 22.172us 66.92% 1.871ms 311.782us 0.000us 0.00% 7.937us 1.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.785us 7.52% 6.785us 1.131us 6 + aten::add 1.23% 34.361us 2.02% 56.562us 9.427us 6.720us 7.45% 6.720us 1.120us 6 + aten::sub 1.36% 38.010us 2.19% 61.310us 10.218us 6.688us 7.41% 6.688us 1.115us 6 + Activity Buffer Request 61.66% 1.724ms 61.66% 1.724ms 1.724ms 1.152us 1.28% 1.152us 1.152us 1 + aten::empty_strided 1.16% 32.541us 1.16% 32.541us 5.424us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 2.01% 56.260us 2.01% 56.260us 9.377us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.34% 65.363us 2.94% 82.214us 3.426us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 16.851us 0.60% 16.851us 0.702us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.84% 219.114us 7.84% 219.114us 4.565us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.580us 0.20% 5.580us 5.580us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.548ms -Self CUDA time total: 90.334us +Self CPU time total: 2.795ms +Self CUDA time total: 90.213us @@ -4294,27 +4076,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 989.616us 1051.23% 989.616us 989.616us 1 - torch_eager 12.09% 307.194us 99.76% 2.536ms 2.536ms 0.000us 0.00% 95.450us 95.450us 1 - aten::mul 6.35% 161.494us 11.09% 281.865us 11.744us 48.958us 52.01% 48.958us 2.040us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.958us 52.01% 48.958us 2.040us 24 - aten::copy_ 4.30% 109.293us 66.10% 1.680ms 93.343us 30.814us 32.73% 32.125us 1.785us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.943us 24.37% 22.943us 1.912us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.367us 15.26% 14.367us 1.197us 12 - aten::clone 0.97% 24.599us 62.75% 1.595ms 265.823us 0.000us 0.00% 9.182us 1.530us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.871us 8.36% 7.871us 1.312us 6 - aten::add 1.20% 30.579us 2.08% 52.891us 8.815us 7.199us 7.65% 7.199us 1.200us 6 - aten::sub 1.49% 37.871us 2.53% 64.231us 10.705us 7.168us 7.61% 7.168us 1.195us 6 - Activity Buffer Request 56.57% 1.438ms 56.57% 1.438ms 1.438ms 1.311us 1.39% 1.311us 1.311us 1 - aten::empty_strided 1.38% 35.041us 1.38% 35.041us 5.840us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 2.38% 60.441us 2.38% 60.441us 10.074us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.77% 70.298us 3.53% 89.841us 3.743us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.77% 19.543us 0.77% 19.543us 0.814us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.50% 241.544us 9.50% 241.544us 5.032us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.24% 6.100us 0.24% 6.100us 6.100us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 927.639us 987.31% 927.639us 927.639us 1 + torch_eager 10.07% 282.335us 99.80% 2.798ms 2.798ms 0.000us 0.00% 95.268us 95.268us 1 + aten::mul 5.75% 161.290us 9.68% 271.373us 11.307us 48.769us 51.91% 48.769us 2.032us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.769us 51.91% 48.769us 2.032us 24 + aten::copy_ 3.66% 102.626us 71.21% 1.996ms 110.912us 30.720us 32.70% 32.032us 1.780us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 24.39% 22.912us 1.909us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.467us 15.40% 14.467us 1.206us 12 + aten::clone 0.79% 22.060us 68.41% 1.918ms 319.628us 0.000us 0.00% 9.120us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 8.31% 7.808us 1.301us 6 + aten::sub 1.36% 38.040us 2.18% 61.002us 10.167us 7.265us 7.73% 7.265us 1.211us 6 + aten::add 1.15% 32.220us 1.90% 53.280us 8.880us 7.202us 7.67% 7.202us 1.200us 6 + Activity Buffer Request 63.51% 1.780ms 63.51% 1.780ms 1.780ms 1.312us 1.40% 1.312us 1.312us 1 + aten::empty_strided 1.12% 31.490us 1.12% 31.490us 5.248us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 1.87% 52.452us 1.87% 52.452us 8.742us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.25% 63.104us 2.86% 80.042us 3.335us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.60% 16.938us 0.60% 16.938us 0.706us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.67% 215.090us 7.67% 215.090us 4.481us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.20% 5.470us 0.20% 5.470us 5.470us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.542ms -Self CUDA time total: 94.139us +Self CPU time total: 2.803ms +Self CUDA time total: 93.956us @@ -4324,27 +4106,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.327us 916.02% 928.327us 928.327us 1 - torch_eager 12.51% 290.049us 99.77% 2.313ms 2.313ms 0.000us 0.00% 102.689us 102.689us 1 - aten::mul 6.36% 147.401us 11.12% 257.946us 10.748us 52.800us 52.10% 52.800us 2.200us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.800us 52.10% 52.800us 2.200us 24 - aten::copy_ 4.62% 107.204us 65.04% 1.508ms 83.777us 32.415us 31.99% 33.760us 1.876us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.511us 24.19% 24.511us 2.043us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.129us 15.92% 16.129us 1.344us 12 - aten::clone 0.98% 22.822us 61.74% 1.431ms 238.579us 0.000us 0.00% 9.249us 1.542us 6 - aten::add 1.37% 31.668us 2.34% 54.320us 9.053us 8.096us 7.99% 8.096us 1.349us 6 - aten::sub 1.57% 36.291us 2.61% 60.431us 10.072us 8.033us 7.93% 8.033us 1.339us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 7.80% 7.904us 1.317us 6 - Activity Buffer Request 46.02% 1.067ms 46.02% 1.067ms 1.067ms 1.345us 1.33% 1.345us 1.345us 1 - aten::empty_strided 1.38% 31.940us 1.38% 31.940us 5.323us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.71% 271.508us 11.71% 271.508us 45.251us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.99% 69.429us 3.79% 87.781us 3.658us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.79% 18.352us 0.79% 18.352us 0.765us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 9.47% 219.548us 9.47% 219.548us 4.574us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.23% 5.380us 0.23% 5.380us 5.380us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.847us 904.69% 918.847us 918.847us 1 + torch_eager 11.08% 278.185us 99.79% 2.506ms 2.506ms 0.000us 0.00% 102.877us 102.877us 1 + aten::mul 6.15% 154.372us 10.54% 264.762us 11.032us 52.638us 51.83% 52.638us 2.193us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.638us 51.83% 52.638us 2.193us 24 + aten::copy_ 4.16% 104.580us 68.26% 1.714ms 95.219us 32.416us 31.92% 33.728us 1.874us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 24.26% 24.641us 2.053us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.511us 16.26% 16.511us 1.376us 12 + aten::clone 0.84% 21.090us 65.15% 1.636ms 272.671us 0.000us 0.00% 9.087us 1.514us 6 + aten::sub 1.51% 38.031us 2.44% 61.190us 10.198us 8.288us 8.16% 8.288us 1.381us 6 + aten::add 1.29% 32.470us 2.19% 54.880us 9.147us 8.223us 8.10% 8.223us 1.371us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.775us 7.66% 7.775us 1.296us 6 + Activity Buffer Request 52.27% 1.312ms 52.27% 1.312ms 1.312ms 1.312us 1.29% 1.312us 1.312us 1 + aten::empty_strided 1.29% 32.302us 1.29% 32.302us 5.384us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 9.44% 236.943us 9.44% 236.943us 39.491us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.53% 63.496us 3.16% 79.393us 3.308us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.63% 15.897us 0.63% 15.897us 0.662us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 8.60% 215.892us 8.60% 215.892us 4.498us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.21% 5.340us 0.21% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.319ms -Self CUDA time total: 101.344us +Self CPU time total: 2.511ms +Self CUDA time total: 101.565us @@ -4354,27 +4136,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.018ms 1082.59% 1.018ms 1.018ms 1 - torch_eager 11.47% 329.955us 99.81% 2.870ms 2.870ms 0.000us 0.00% 95.358us 95.358us 1 - aten::mul 5.65% 162.614us 9.86% 283.677us 11.820us 49.056us 52.16% 49.056us 2.044us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.056us 52.16% 49.056us 2.044us 24 - aten::copy_ 3.88% 111.664us 68.17% 1.960ms 108.907us 30.720us 32.66% 32.032us 1.780us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 24.33% 22.880us 1.907us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.270us 15.17% 14.270us 1.189us 12 - aten::clone 1.07% 30.831us 65.73% 1.890ms 315.021us 0.000us 0.00% 9.152us 1.525us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 8.34% 7.840us 1.307us 6 - aten::add 1.15% 33.191us 2.07% 59.441us 9.907us 7.167us 7.62% 7.167us 1.194us 6 - aten::sub 1.59% 45.863us 2.59% 74.463us 12.411us 7.103us 7.55% 7.103us 1.184us 6 - Activity Buffer Request 50.07% 1.440ms 50.07% 1.440ms 1.440ms 1.312us 1.40% 1.312us 1.312us 1 - aten::empty_strided 1.26% 36.310us 1.26% 36.310us 6.052us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.96% 343.839us 11.96% 343.839us 57.306us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.64% 75.860us 3.31% 95.264us 3.969us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.67% 19.404us 0.67% 19.404us 0.809us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.38% 240.995us 8.38% 240.995us 5.021us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.330us 0.19% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 991.709us 1060.94% 991.709us 991.709us 1 + torch_eager 10.56% 336.649us 99.82% 3.183ms 3.183ms 0.000us 0.00% 94.755us 94.755us 1 + aten::mul 5.20% 165.794us 8.73% 278.295us 11.596us 48.674us 52.07% 48.674us 2.028us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 48.674us 52.07% 48.674us 2.028us 24 + aten::copy_ 3.76% 119.863us 72.07% 2.298ms 127.674us 30.622us 32.76% 31.902us 1.772us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.878us 24.47% 22.878us 1.907us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.179us 15.17% 14.179us 1.182us 12 + aten::clone 0.88% 28.161us 69.55% 2.218ms 369.616us 0.000us 0.00% 9.024us 1.504us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 8.28% 7.744us 1.291us 6 + aten::sub 1.28% 40.920us 2.05% 65.511us 10.918us 7.138us 7.64% 7.138us 1.190us 6 + aten::add 1.05% 33.330us 1.81% 57.620us 9.603us 7.041us 7.53% 7.041us 1.173us 6 + Activity Buffer Request 55.60% 1.773ms 55.60% 1.773ms 1.773ms 1.280us 1.37% 1.280us 1.280us 1 + aten::empty_strided 1.06% 33.640us 1.06% 33.640us 5.607us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.74% 342.585us 10.74% 342.585us 57.097us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.14% 68.349us 2.66% 84.959us 3.540us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.52% 16.610us 0.52% 16.610us 0.692us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.03% 224.072us 7.03% 224.072us 4.668us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.590us 0.18% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.876ms -Self CUDA time total: 94.046us +Self CPU time total: 3.189ms +Self CUDA time total: 93.475us @@ -4384,27 +4166,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 913.335us 900.40% 913.335us 913.335us 1 - torch_eager 10.58% 290.726us 99.81% 2.742ms 2.742ms 0.000us 0.00% 102.781us 102.781us 1 - aten::mul 5.30% 145.663us 9.31% 255.637us 10.652us 52.735us 51.99% 52.735us 2.197us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.735us 51.99% 52.735us 2.197us 24 - aten::copy_ 3.74% 102.751us 70.53% 1.937ms 107.622us 32.638us 32.18% 33.982us 1.888us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.735us 24.38% 24.735us 2.061us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.064us 15.84% 16.064us 1.339us 12 - aten::clone 0.88% 24.121us 67.96% 1.867ms 311.110us 0.000us 0.00% 9.247us 1.541us 6 - aten::sub 1.29% 35.411us 2.16% 59.202us 9.867us 8.033us 7.92% 8.033us 1.339us 6 - aten::add 1.13% 30.931us 1.93% 52.952us 8.825us 8.031us 7.92% 8.031us 1.339us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.903us 7.79% 7.903us 1.317us 6 - Activity Buffer Request 52.85% 1.452ms 52.85% 1.452ms 1.452ms 1.344us 1.32% 1.344us 1.344us 1 - aten::empty_strided 1.21% 33.351us 1.21% 33.351us 5.559us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.71% 321.577us 11.71% 321.577us 53.596us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.55% 69.990us 3.22% 88.522us 3.688us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.67% 18.532us 0.67% 18.532us 0.772us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.90% 216.969us 7.90% 216.969us 4.520us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.19% 5.091us 0.19% 5.091us 5.091us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 941.177us 926.36% 941.177us 941.177us 1 + torch_eager 9.56% 295.804us 99.83% 3.088ms 3.088ms 0.000us 0.00% 102.911us 102.911us 1 + aten::mul 5.03% 155.643us 8.60% 265.986us 11.083us 52.802us 51.97% 52.802us 2.200us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 52.802us 51.97% 52.802us 2.200us 24 + aten::copy_ 3.66% 113.330us 73.34% 2.269ms 126.052us 32.447us 31.94% 33.759us 1.876us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 24.31% 24.703us 2.059us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.350us 16.09% 16.350us 1.363us 12 + aten::clone 0.71% 21.820us 70.53% 2.182ms 363.694us 0.000us 0.00% 9.056us 1.509us 6 + aten::sub 1.30% 40.120us 2.07% 63.950us 10.658us 8.223us 8.09% 8.223us 1.370us 6 + aten::add 1.17% 36.201us 1.90% 58.931us 9.822us 8.127us 8.00% 8.127us 1.355us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.62% 7.744us 1.291us 6 + Activity Buffer Request 57.23% 1.771ms 57.23% 1.771ms 1.771ms 1.312us 1.29% 1.312us 1.312us 1 + aten::empty_strided 0.98% 30.371us 0.98% 30.371us 5.062us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.40% 321.885us 10.40% 321.885us 53.647us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.12% 65.592us 2.67% 82.622us 3.443us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.55% 17.030us 0.55% 17.030us 0.710us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.11% 219.985us 7.11% 219.985us 4.583us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.17% 5.340us 0.17% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.747ms -Self CUDA time total: 101.437us +Self CPU time total: 3.094ms +Self CUDA time total: 101.599us @@ -4414,27 +4196,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 929.433us 768.61% 929.433us 929.433us 1 - torch_eager 10.84% 297.701us 99.80% 2.742ms 2.742ms 0.000us 0.00% 122.716us 122.716us 1 - aten::mul 5.42% 148.850us 9.41% 258.632us 10.776us 62.014us 51.28% 62.014us 2.584us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.014us 51.28% 62.014us 2.584us 24 - aten::copy_ 3.77% 103.682us 70.14% 1.927ms 107.043us 39.328us 32.52% 41.120us 2.284us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.82% 28.800us 2.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.582us 16.19% 19.582us 1.632us 12 - aten::clone 0.88% 24.131us 67.45% 1.853ms 308.828us 0.000us 0.00% 12.320us 2.053us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.528us 8.71% 10.528us 1.755us 6 - aten::sub 1.29% 35.482us 2.16% 59.433us 9.905us 9.792us 8.10% 9.792us 1.632us 6 - aten::add 1.13% 31.104us 1.94% 53.172us 8.862us 9.790us 8.10% 9.790us 1.632us 6 - Activity Buffer Request 52.94% 1.454ms 52.94% 1.454ms 1.454ms 1.792us 1.48% 1.792us 1.792us 1 - aten::empty_strided 1.18% 32.542us 1.18% 32.542us 5.424us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 11.19% 307.407us 11.19% 307.407us 51.235us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.56% 70.268us 3.25% 89.361us 3.723us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.70% 19.093us 0.70% 19.093us 0.796us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 7.91% 217.262us 7.91% 217.262us 4.526us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.370us 0.20% 5.370us 5.370us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.963us 782.64% 943.963us 943.963us 1 + torch_eager 9.85% 301.136us 99.82% 3.051ms 3.051ms 0.000us 0.00% 122.468us 122.468us 1 + aten::mul 5.14% 157.189us 8.67% 264.988us 11.041us 61.985us 51.39% 61.985us 2.583us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.985us 51.39% 61.985us 2.583us 24 + aten::copy_ 3.53% 107.981us 72.58% 2.218ms 123.247us 39.362us 32.64% 41.218us 2.290us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.802us 23.88% 28.802us 2.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.265us 15.97% 19.265us 1.605us 12 + aten::clone 0.97% 29.629us 70.14% 2.144ms 357.356us 0.000us 0.00% 12.416us 2.069us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.560us 8.76% 10.560us 1.760us 6 + aten::add 1.14% 34.930us 1.90% 58.161us 9.693us 9.633us 7.99% 9.633us 1.606us 6 + aten::sub 1.25% 38.210us 2.05% 62.510us 10.418us 9.632us 7.99% 9.632us 1.605us 6 + Activity Buffer Request 57.00% 1.742ms 57.00% 1.742ms 1.742ms 1.856us 1.54% 1.856us 1.856us 1 + aten::empty_strided 1.01% 31.021us 1.01% 31.021us 5.170us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 10.03% 306.454us 10.03% 306.454us 51.076us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.23% 68.242us 2.79% 85.430us 3.560us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.56% 17.188us 0.56% 17.188us 0.716us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.10% 217.131us 7.10% 217.131us 4.524us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.390us 0.18% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.747ms -Self CUDA time total: 120.924us +Self CPU time total: 3.057ms +Self CUDA time total: 120.612us @@ -4444,27 +4226,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 942.082us 549.37% 942.082us 942.082us 1 - torch_eager 20.10% 308.752us 99.67% 1.531ms 1.531ms 0.000us 0.00% 174.365us 174.365us 1 - aten::mul 9.79% 150.414us 16.96% 260.516us 10.855us 89.056us 51.93% 89.056us 3.711us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.056us 51.93% 89.056us 3.711us 24 - aten::copy_ 6.91% 106.224us 46.22% 710.060us 39.448us 57.503us 33.53% 60.383us 3.355us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.511us 23.62% 40.511us 3.376us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.926us 14.54% 24.926us 2.077us 12 - aten::clone 1.37% 21.029us 40.87% 627.796us 104.633us 0.000us 0.00% 19.872us 3.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.992us 9.91% 16.992us 2.832us 6 - aten::sub 2.26% 34.730us 3.83% 58.781us 9.797us 12.479us 7.28% 12.479us 2.080us 6 - aten::add 2.00% 30.683us 3.45% 52.973us 8.829us 12.447us 7.26% 12.447us 2.075us 6 - Activity Buffer Request 16.15% 248.056us 16.15% 248.056us 248.056us 2.880us 1.68% 2.880us 2.880us 1 - aten::empty_strided 2.04% 31.392us 2.04% 31.392us 5.232us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.97% 291.479us 18.97% 291.479us 48.580us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.49% 68.986us 5.70% 87.586us 3.649us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.21% 18.600us 1.21% 18.600us 0.775us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 14.37% 220.744us 14.37% 220.744us 4.599us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.33% 5.080us 0.33% 5.080us 5.080us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 928.245us 538.18% 928.245us 928.245us 1 + torch_eager 19.14% 292.425us 99.66% 1.523ms 1.523ms 0.000us 0.00% 175.325us 175.325us 1 + aten::mul 10.16% 155.270us 17.20% 262.742us 10.948us 89.630us 51.97% 89.630us 3.735us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.630us 51.97% 89.630us 3.735us 24 + aten::copy_ 6.82% 104.170us 46.76% 714.441us 39.691us 57.920us 33.58% 60.768us 3.376us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.768us 23.64% 40.768us 3.397us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.927us 14.45% 24.927us 2.077us 12 + aten::clone 1.34% 20.471us 41.24% 630.180us 105.030us 0.000us 0.00% 20.000us 3.333us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.152us 9.94% 17.152us 2.859us 6 + aten::sub 2.56% 39.072us 4.07% 62.112us 10.352us 12.480us 7.24% 12.480us 2.080us 6 + aten::add 2.20% 33.610us 3.65% 55.810us 9.302us 12.447us 7.22% 12.447us 2.075us 6 + Activity Buffer Request 16.69% 254.944us 16.69% 254.944us 254.944us 2.848us 1.65% 2.848us 2.848us 1 + aten::empty_strided 2.04% 31.181us 2.04% 31.181us 5.197us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 19.06% 291.294us 19.06% 291.294us 48.549us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.37% 66.700us 5.47% 83.522us 3.480us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.10% 16.822us 1.10% 16.822us 0.701us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.19% 216.745us 14.19% 216.745us 4.516us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.34% 5.240us 0.34% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.536ms -Self CUDA time total: 171.485us +Self CPU time total: 1.528ms +Self CUDA time total: 172.477us @@ -4474,27 +4256,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 906.096us 748.31% 906.096us 906.096us 1 - torch_eager 18.91% 280.775us 99.66% 1.480ms 1.480ms 0.000us 0.00% 122.910us 122.910us 1 - aten::mul 10.01% 148.664us 17.45% 259.167us 10.799us 62.174us 51.35% 62.174us 2.591us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 62.174us 51.35% 62.174us 2.591us 24 - aten::copy_ 6.88% 102.100us 46.50% 690.526us 38.363us 39.392us 32.53% 41.216us 2.290us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.800us 23.78% 28.800us 2.400us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.520us 16.12% 19.520us 1.627us 12 - aten::clone 1.45% 21.579us 41.36% 614.176us 102.363us 0.000us 0.00% 12.416us 2.069us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 8.75% 10.592us 1.765us 6 - aten::sub 2.32% 34.432us 3.90% 57.973us 9.662us 9.760us 8.06% 9.760us 1.627us 6 - aten::add 2.12% 31.432us 3.61% 53.552us 8.925us 9.760us 8.06% 9.760us 1.627us 6 - Activity Buffer Request 17.05% 253.136us 17.05% 253.136us 253.136us 1.824us 1.51% 1.824us 1.824us 1 - aten::empty_strided 2.06% 30.533us 2.06% 30.533us 5.089us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.50% 274.717us 18.50% 274.717us 45.786us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.53% 67.311us 5.78% 85.812us 3.575us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.25% 18.501us 1.25% 18.501us 0.771us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 14.60% 216.737us 14.60% 216.737us 4.515us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.34% 4.981us 0.34% 4.981us 4.981us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 923.899us 767.46% 923.899us 923.899us 1 + torch_eager 19.14% 287.798us 99.65% 1.499ms 1.499ms 0.000us 0.00% 122.144us 122.144us 1 + aten::mul 10.49% 157.698us 17.70% 266.255us 11.094us 61.982us 51.49% 61.982us 2.583us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 61.982us 51.49% 61.982us 2.583us 24 + aten::copy_ 6.99% 105.118us 46.36% 697.187us 38.733us 39.264us 32.62% 41.024us 2.279us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.832us 23.95% 28.832us 2.403us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.138us 15.90% 19.138us 1.595us 12 + aten::clone 1.32% 19.822us 40.79% 613.519us 102.253us 0.000us 0.00% 12.192us 2.032us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.67% 10.432us 1.739us 6 + aten::sub 2.51% 37.801us 4.08% 61.341us 10.224us 9.570us 7.95% 9.570us 1.595us 6 + aten::add 2.16% 32.471us 3.63% 54.661us 9.110us 9.568us 7.95% 9.568us 1.595us 6 + Activity Buffer Request 16.71% 251.314us 16.71% 251.314us 251.314us 1.760us 1.46% 1.760us 1.760us 1 + aten::empty_strided 2.00% 30.060us 2.00% 30.060us 5.010us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 18.58% 279.394us 18.58% 279.394us 46.566us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.31% 64.750us 5.43% 81.609us 3.400us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.12% 16.859us 1.12% 16.859us 0.702us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.34% 215.648us 14.34% 215.648us 4.493us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.35% 5.220us 0.35% 5.220us 5.220us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.485ms -Self CUDA time total: 121.086us +Self CPU time total: 1.504ms +Self CUDA time total: 120.384us @@ -4504,27 +4286,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.294us 555.32% 954.294us 954.294us 1 - torch_eager 11.21% 307.269us 99.82% 2.735ms 2.735ms 0.000us 0.00% 174.694us 174.694us 1 - aten::mul 5.59% 153.258us 9.69% 265.580us 11.066us 89.476us 52.07% 89.476us 3.728us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.476us 52.07% 89.476us 3.728us 24 - aten::copy_ 3.78% 103.631us 69.46% 1.903ms 105.735us 57.505us 33.46% 60.353us 3.353us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.545us 23.59% 40.545us 3.379us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.865us 14.47% 24.865us 2.072us 12 - aten::clone 0.89% 24.491us 66.72% 1.828ms 304.733us 0.000us 0.00% 19.808us 3.301us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.87% 16.960us 2.827us 6 - aten::add 1.15% 31.480us 1.96% 53.761us 8.960us 12.448us 7.24% 12.448us 2.075us 6 - aten::sub 1.31% 35.801us 2.17% 59.462us 9.910us 12.417us 7.23% 12.417us 2.070us 6 - Activity Buffer Request 53.91% 1.477ms 53.91% 1.477ms 1.477ms 2.848us 1.66% 2.848us 2.848us 1 - aten::empty_strided 1.13% 30.930us 1.13% 30.930us 5.155us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.51% 260.666us 9.51% 260.666us 43.444us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.58% 70.761us 3.30% 90.449us 3.769us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.72% 19.688us 0.72% 19.688us 0.820us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.03% 220.086us 8.03% 220.086us 4.585us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.18% 5.030us 0.18% 5.030us 5.030us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.259us 547.68% 943.259us 943.259us 1 + torch_eager 9.82% 293.988us 99.82% 2.988ms 2.988ms 0.000us 0.00% 175.075us 175.075us 1 + aten::mul 5.17% 154.631us 8.81% 263.742us 10.989us 89.536us 51.99% 89.536us 3.731us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 89.536us 51.99% 89.536us 3.731us 24 + aten::copy_ 3.66% 109.570us 72.53% 2.171ms 120.590us 57.795us 33.56% 60.643us 3.369us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.835us 23.71% 40.835us 3.403us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.896us 14.46% 24.896us 2.075us 12 + aten::clone 0.74% 22.030us 69.74% 2.087ms 347.874us 0.000us 0.00% 19.808us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.85% 16.960us 2.827us 6 + aten::add 1.10% 32.890us 1.87% 55.840us 9.307us 12.481us 7.25% 12.481us 2.080us 6 + aten::sub 1.28% 38.273us 2.11% 63.142us 10.524us 12.415us 7.21% 12.415us 2.069us 6 + Activity Buffer Request 58.02% 1.736ms 58.02% 1.736ms 1.736ms 2.848us 1.65% 2.848us 2.848us 1 + aten::empty_strided 1.00% 30.050us 1.00% 30.050us 5.008us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.83% 264.325us 8.83% 264.325us 44.054us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.36% 70.650us 2.95% 88.161us 3.673us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.59% 17.511us 0.59% 17.511us 0.730us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.26% 217.282us 7.26% 217.282us 4.527us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.289us 0.18% 5.289us 5.289us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.740ms -Self CUDA time total: 171.846us +Self CPU time total: 2.993ms +Self CUDA time total: 172.227us @@ -4534,27 +4316,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 917.943us 324.46% 917.943us 917.943us 1 - torch_eager 18.90% 277.703us 99.65% 1.464ms 1.464ms 0.000us 0.00% 301.376us 301.376us 1 - aten::mul 9.84% 144.586us 17.44% 256.139us 10.672us 132.736us 46.92% 132.736us 5.531us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.736us 46.92% 132.736us 5.531us 24 - aten::copy_ 7.06% 103.765us 45.63% 670.307us 37.239us 109.119us 38.57% 127.583us 7.088us 18 - aten::clone 1.58% 23.262us 40.78% 599.096us 99.849us 0.000us 0.00% 70.336us 11.723us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.247us 20.23% 57.247us 4.771us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 51.872us 18.34% 51.872us 8.645us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.057us 14.51% 41.057us 3.421us 12 - aten::add 2.13% 31.271us 3.65% 53.632us 8.939us 20.545us 7.26% 20.545us 3.424us 6 - aten::sub 2.39% 35.109us 4.06% 59.711us 9.952us 20.512us 7.25% 20.512us 3.419us 6 - Activity Buffer Request 16.07% 236.106us 16.07% 236.106us 236.106us 18.464us 6.53% 18.464us 18.464us 1 - aten::empty_strided 2.35% 34.500us 2.35% 34.500us 5.750us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 18.36% 269.767us 18.36% 269.767us 44.961us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.78% 70.183us 6.04% 88.753us 3.698us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.26% 18.570us 1.26% 18.570us 0.774us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 14.92% 219.185us 14.92% 219.185us 4.566us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 5.090us 0.35% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 922.006us 322.11% 922.006us 922.006us 1 + torch_eager 19.42% 278.764us 99.64% 1.431ms 1.431ms 0.000us 0.00% 304.543us 304.543us 1 + aten::mul 10.68% 153.400us 18.09% 259.803us 10.825us 134.112us 46.85% 134.112us 5.588us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.112us 46.85% 134.112us 5.588us 24 + aten::copy_ 7.65% 109.831us 44.83% 643.670us 35.759us 111.232us 38.86% 129.536us 7.196us 18 + aten::clone 1.43% 20.539us 38.82% 557.349us 92.892us 0.000us 0.00% 72.160us 12.027us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.376us 20.04% 57.376us 4.781us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.856us 18.82% 53.856us 8.976us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.895us 14.29% 40.895us 3.408us 12 + aten::sub 2.68% 38.501us 4.30% 61.692us 10.282us 20.543us 7.18% 20.543us 3.424us 6 + aten::add 2.29% 32.829us 3.81% 54.730us 9.122us 20.352us 7.11% 20.352us 3.392us 6 + Activity Buffer Request 16.08% 230.904us 16.08% 230.904us 230.904us 18.304us 6.39% 18.304us 18.304us 1 + aten::empty_strided 2.06% 29.601us 2.06% 29.601us 4.933us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 16.83% 241.674us 16.83% 241.674us 40.279us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.51% 64.754us 5.69% 81.743us 3.406us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.18% 16.989us 1.18% 16.989us 0.708us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 14.82% 212.756us 14.82% 212.756us 4.432us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.240us 0.36% 5.240us 5.240us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.469ms -Self CUDA time total: 282.912us +Self CPU time total: 1.436ms +Self CUDA time total: 286.239us @@ -4564,27 +4346,27 @@ PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.832us 165.35% 931.832us 931.832us 1 - torch_eager 19.27% 283.137us 99.64% 1.464ms 1.464ms 0.000us 0.00% 587.261us 587.261us 1 - aten::copy_ 7.04% 103.435us 44.90% 659.587us 36.644us 272.511us 48.36% 296.223us 16.457us 18 - aten::mul 10.36% 152.225us 18.18% 267.110us 11.130us 224.829us 39.90% 224.829us 9.368us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 224.829us 39.90% 224.829us 9.368us 24 - aten::clone 1.47% 21.550us 39.53% 580.673us 96.779us 0.000us 0.00% 205.855us 34.309us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.143us 32.32% 182.143us 30.357us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.368us 16.04% 90.368us 7.531us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 66.209us 11.75% 66.209us 5.517us 12 - aten::sub 2.39% 35.041us 4.07% 59.831us 9.972us 33.760us 5.99% 33.760us 5.627us 6 - aten::add 2.15% 31.591us 3.70% 54.401us 9.067us 32.449us 5.76% 32.449us 5.408us 6 - Activity Buffer Request 16.23% 238.406us 16.23% 238.406us 238.406us 23.712us 4.21% 23.712us 23.712us 1 - aten::empty_strided 2.04% 29.960us 2.04% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.39% 255.475us 17.39% 255.475us 42.579us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.73% 69.441us 6.00% 88.092us 3.670us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.27% 18.651us 1.27% 18.651us 0.777us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.30% 224.756us 15.30% 224.756us 4.682us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.36% 5.280us 0.36% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 970.352us 169.72% 970.352us 970.352us 1 + torch_eager 19.50% 289.365us 99.64% 1.478ms 1.478ms 0.000us 0.00% 595.480us 595.480us 1 + aten::copy_ 7.05% 104.551us 43.31% 642.598us 35.700us 273.596us 47.85% 297.340us 16.519us 18 + aten::mul 11.63% 172.532us 19.46% 288.666us 12.028us 232.863us 40.73% 232.863us 9.703us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.863us 40.73% 232.863us 9.703us 24 + aten::clone 1.45% 21.521us 37.67% 558.878us 93.146us 0.000us 0.00% 205.949us 34.325us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 182.205us 31.87% 182.205us 30.367us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.391us 15.98% 91.391us 7.616us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.277us 11.42% 65.277us 5.440us 12 + aten::sub 2.70% 40.111us 4.36% 64.701us 10.784us 32.768us 5.73% 32.768us 5.461us 6 + aten::add 2.31% 34.320us 3.88% 57.510us 9.585us 32.509us 5.69% 32.509us 5.418us 6 + Activity Buffer Request 17.48% 259.324us 17.48% 259.324us 259.324us 23.744us 4.15% 23.744us 23.744us 1 + aten::empty_strided 2.00% 29.720us 2.00% 29.720us 4.953us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 14.68% 217.742us 14.68% 217.742us 36.290us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.50% 66.694us 5.68% 84.252us 3.511us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.18% 17.558us 1.18% 17.558us 0.732us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.16% 224.895us 15.16% 224.895us 4.685us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.36% 5.340us 0.36% 5.340us 5.340us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.469ms -Self CUDA time total: 563.549us +Self CPU time total: 1.484ms +Self CUDA time total: 571.736us @@ -4594,27 +4376,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 948.157us 1025.28% 948.157us 948.157us 1 - torch_eager 11.31% 303.890us 99.80% 2.681ms 2.681ms 0.000us 0.00% 93.597us 93.597us 1 - aten::mul 5.70% 153.152us 9.94% 267.009us 11.125us 49.696us 53.74% 49.696us 2.071us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.696us 53.74% 49.696us 2.071us 24 - aten::copy_ 3.75% 100.883us 69.10% 1.857ms 103.143us 29.375us 31.76% 30.494us 1.694us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.592us 24.43% 22.592us 1.883us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.407us 14.50% 13.407us 1.117us 12 - aten::clone 0.85% 22.792us 66.32% 1.782ms 296.986us 0.000us 0.00% 7.902us 1.317us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 7.33% 6.783us 1.131us 6 - aten::sub 1.31% 35.191us 2.17% 58.341us 9.724us 6.720us 7.27% 6.720us 1.120us 6 - aten::add 1.15% 30.820us 1.98% 53.181us 8.863us 6.687us 7.23% 6.687us 1.114us 6 - Activity Buffer Request 53.95% 1.449ms 53.95% 1.449ms 1.449ms 1.119us 1.21% 1.119us 1.119us 1 - aten::empty_strided 1.15% 30.830us 1.15% 30.830us 5.138us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 9.13% 245.326us 9.13% 245.326us 40.888us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.61% 70.171us 3.31% 88.830us 3.701us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.69% 18.659us 0.69% 18.659us 0.777us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.20% 220.298us 8.20% 220.298us 4.590us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.250us 0.20% 5.250us 5.250us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 936.155us 1011.59% 936.155us 936.155us 1 + torch_eager 9.66% 281.404us 99.82% 2.908ms 2.908ms 0.000us 0.00% 93.663us 93.663us 1 + aten::mul 5.48% 159.764us 9.36% 272.564us 11.357us 49.568us 53.56% 49.568us 2.065us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 49.568us 53.56% 49.568us 2.065us 24 + aten::copy_ 3.70% 107.711us 72.25% 2.105ms 116.944us 29.407us 31.78% 30.527us 1.696us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.591us 24.41% 22.591us 1.883us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.568us 14.66% 13.568us 1.131us 12 + aten::clone 0.74% 21.551us 69.34% 2.020ms 336.695us 0.000us 0.00% 7.936us 1.323us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 7.37% 6.816us 1.136us 6 + aten::sub 1.31% 38.128us 2.13% 61.912us 10.319us 6.815us 7.36% 6.815us 1.136us 6 + aten::add 1.08% 31.450us 1.84% 53.600us 8.933us 6.753us 7.30% 6.753us 1.126us 6 + Activity Buffer Request 59.75% 1.741ms 59.75% 1.741ms 1.741ms 1.120us 1.21% 1.120us 1.120us 1 + aten::empty_strided 1.04% 30.170us 1.04% 30.170us 5.028us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.73% 196.044us 6.73% 196.044us 32.674us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.24% 65.300us 2.82% 82.022us 3.418us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.57% 16.722us 0.57% 16.722us 0.697us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.53% 219.305us 7.53% 219.305us 4.569us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.160us 0.18% 5.160us 5.160us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.687ms -Self CUDA time total: 92.478us +Self CPU time total: 2.913ms +Self CUDA time total: 92.543us @@ -4624,27 +4406,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 924.823us 959.84% 924.823us 924.823us 1 - torch_eager 19.47% 279.525us 99.65% 1.430ms 1.430ms 0.000us 0.00% 97.664us 97.664us 1 - aten::mul 10.27% 147.364us 19.04% 273.370us 11.390us 51.165us 53.10% 51.165us 2.132us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.165us 53.10% 51.165us 2.132us 24 - aten::copy_ 7.14% 102.519us 43.74% 627.869us 34.882us 30.913us 32.08% 32.225us 1.790us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 23.040us 23.91% 23.040us 1.920us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.274us 14.81% 14.274us 1.189us 12 - aten::clone 1.45% 20.838us 38.33% 550.144us 91.691us 0.000us 0.00% 9.185us 1.531us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.873us 8.17% 7.873us 1.312us 6 - aten::add 2.18% 31.279us 3.75% 53.900us 8.983us 7.137us 7.41% 7.137us 1.189us 6 - aten::sub 2.45% 35.101us 4.11% 58.931us 9.822us 7.137us 7.41% 7.137us 1.189us 6 - Activity Buffer Request 15.34% 220.215us 15.34% 220.215us 220.215us 1.312us 1.36% 1.312us 1.312us 1 - aten::empty_strided 2.15% 30.891us 2.15% 30.891us 5.148us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.11% 245.545us 17.11% 245.545us 40.924us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.62% 66.322us 5.93% 85.082us 3.545us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.31% 18.760us 1.31% 18.760us 0.782us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 16.17% 232.047us 16.17% 232.047us 4.834us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 5.041us 0.35% 5.041us 5.041us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 918.262us 956.86% 918.262us 918.262us 1 + torch_eager 20.02% 274.163us 99.62% 1.364ms 1.364ms 0.000us 0.00% 97.279us 97.279us 1 + aten::mul 11.52% 157.766us 19.39% 265.646us 11.069us 51.167us 53.32% 51.167us 2.132us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 51.167us 53.32% 51.167us 2.132us 24 + aten::copy_ 7.76% 106.268us 42.02% 575.576us 31.976us 30.720us 32.01% 32.033us 1.780us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.912us 23.88% 22.912us 1.909us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 14.079us 14.67% 14.079us 1.173us 12 + aten::clone 1.48% 20.322us 36.02% 493.298us 82.216us 0.000us 0.00% 9.121us 1.520us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.808us 8.14% 7.808us 1.301us 6 + aten::sub 2.81% 38.541us 4.49% 61.481us 10.247us 7.072us 7.37% 7.072us 1.179us 6 + aten::add 2.42% 33.131us 4.04% 55.302us 9.217us 7.007us 7.30% 7.007us 1.168us 6 + Activity Buffer Request 16.17% 221.544us 16.17% 221.544us 221.544us 1.313us 1.37% 1.313us 1.313us 1 + aten::empty_strided 2.33% 31.950us 2.33% 31.950us 5.325us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.69% 187.513us 13.69% 187.513us 31.252us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.61% 63.101us 5.84% 79.961us 3.332us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.23% 16.860us 1.23% 16.860us 0.702us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.57% 213.242us 15.57% 213.242us 4.443us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.270us 0.38% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.435ms -Self CUDA time total: 96.352us +Self CPU time total: 1.370ms +Self CUDA time total: 95.966us @@ -4654,27 +4436,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 915.886us 880.13% 915.886us 915.886us 1 - torch_eager 19.45% 278.057us 99.65% 1.425ms 1.425ms 0.000us 0.00% 105.374us 105.374us 1 - aten::mul 10.44% 149.250us 18.09% 258.645us 10.777us 55.325us 53.17% 55.325us 2.305us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.325us 53.17% 55.325us 2.305us 24 - aten::copy_ 7.22% 103.283us 44.53% 636.707us 35.373us 32.575us 31.30% 33.887us 1.883us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 23.74% 24.703us 2.059us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.162us 15.53% 16.162us 1.347us 12 - aten::clone 1.49% 21.291us 38.97% 557.204us 92.867us 0.000us 0.00% 9.184us 1.531us 6 - aten::sub 2.42% 34.610us 4.09% 58.491us 9.749us 8.096us 7.78% 8.096us 1.349us 6 - aten::add 2.18% 31.210us 3.76% 53.710us 8.952us 8.066us 7.75% 8.066us 1.344us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.872us 7.56% 7.872us 1.312us 6 - Activity Buffer Request 15.88% 227.005us 15.88% 227.005us 227.005us 1.312us 1.26% 1.312us 1.312us 1 - aten::empty_strided 2.12% 30.341us 2.12% 30.341us 5.057us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.11% 244.667us 17.11% 244.667us 40.778us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.81% 68.755us 6.12% 87.484us 3.645us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.31% 18.729us 1.31% 18.729us 0.780us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.21% 217.528us 15.21% 217.528us 4.532us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 5.011us 0.35% 5.011us 5.011us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 929.528us 892.96% 929.528us 929.528us 1 + torch_eager 20.25% 278.528us 99.63% 1.370ms 1.370ms 0.000us 0.00% 105.439us 105.439us 1 + aten::mul 11.59% 159.422us 19.60% 269.583us 11.233us 55.326us 53.15% 55.326us 2.305us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.326us 53.15% 55.326us 2.305us 24 + aten::copy_ 7.64% 105.130us 41.59% 572.021us 31.779us 32.351us 31.08% 33.695us 1.872us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.607us 23.64% 24.607us 2.051us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.418us 15.77% 16.418us 1.368us 12 + aten::clone 1.49% 20.431us 35.49% 488.057us 81.343us 0.000us 0.00% 9.088us 1.515us 6 + aten::sub 2.60% 35.723us 4.36% 59.953us 9.992us 8.258us 7.93% 8.258us 1.376us 6 + aten::add 2.46% 33.770us 4.07% 55.940us 9.323us 8.160us 7.84% 8.160us 1.360us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.744us 7.44% 7.744us 1.291us 6 + Activity Buffer Request 16.10% 221.454us 16.10% 221.454us 221.454us 1.344us 1.29% 1.344us 1.344us 1 + aten::empty_strided 2.25% 30.990us 2.25% 30.990us 5.165us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.30% 182.863us 13.30% 182.863us 30.477us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.81% 66.212us 6.02% 82.825us 3.451us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.21% 16.613us 1.21% 16.613us 0.692us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.93% 219.135us 15.93% 219.135us 4.565us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.37% 5.090us 0.37% 5.090us 5.090us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.430ms -Self CUDA time total: 104.062us +Self CPU time total: 1.375ms +Self CUDA time total: 104.095us @@ -4684,27 +4466,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 926.227us 747.17% 926.227us 926.227us 1 - torch_eager 10.87% 288.725us 99.79% 2.651ms 2.651ms 0.000us 0.00% 125.755us 125.755us 1 - aten::mul 5.66% 150.315us 9.84% 261.507us 10.896us 65.119us 52.53% 65.119us 2.713us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.119us 52.53% 65.119us 2.713us 24 - aten::copy_ 3.77% 100.152us 69.45% 1.845ms 102.495us 39.455us 31.83% 41.246us 2.291us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.928us 23.34% 28.928us 2.411us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.390us 15.64% 19.390us 1.616us 12 - aten::clone 0.89% 23.522us 66.73% 1.773ms 295.426us 0.000us 0.00% 12.318us 2.053us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.527us 8.49% 10.527us 1.755us 6 - aten::add 1.16% 30.840us 2.00% 53.221us 8.870us 9.759us 7.87% 9.759us 1.626us 6 - aten::sub 1.31% 34.853us 2.22% 58.863us 9.811us 9.631us 7.77% 9.631us 1.605us 6 - Activity Buffer Request 54.50% 1.448ms 54.50% 1.448ms 1.448ms 1.791us 1.44% 1.791us 1.791us 1 - aten::empty_strided 1.16% 30.740us 1.16% 30.740us 5.123us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.93% 237.245us 8.93% 237.245us 39.541us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.65% 70.502us 3.36% 89.223us 3.718us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.70% 18.721us 0.70% 18.721us 0.780us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.19% 217.516us 8.19% 217.516us 4.532us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.590us 0.21% 5.590us 5.590us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 943.134us 762.57% 943.134us 943.134us 1 + torch_eager 9.91% 288.756us 99.81% 2.907ms 2.907ms 0.000us 0.00% 125.503us 125.503us 1 + aten::mul 5.47% 159.428us 9.14% 266.247us 11.094us 65.088us 52.63% 65.088us 2.712us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.088us 52.63% 65.088us 2.712us 24 + aten::copy_ 3.82% 111.411us 72.08% 2.100ms 116.650us 39.391us 31.85% 41.215us 2.290us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.799us 23.29% 28.799us 2.400us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.200us 15.52% 19.200us 1.600us 12 + aten::clone 0.71% 20.821us 69.14% 2.014ms 335.649us 0.000us 0.00% 12.416us 2.069us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.592us 8.56% 10.592us 1.765us 6 + aten::sub 1.35% 39.440us 2.20% 63.980us 10.663us 9.632us 7.79% 9.632us 1.605us 6 + aten::add 1.16% 33.802us 1.92% 55.961us 9.327us 9.568us 7.74% 9.568us 1.595us 6 + Activity Buffer Request 59.81% 1.742ms 59.81% 1.742ms 1.742ms 1.824us 1.47% 1.824us 1.824us 1 + aten::empty_strided 1.06% 30.871us 1.06% 30.871us 5.145us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.32% 184.202us 6.32% 184.202us 30.700us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.20% 64.120us 2.78% 80.888us 3.370us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.58% 16.768us 0.58% 16.768us 0.699us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.39% 215.298us 7.39% 215.298us 4.485us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.660us 0.19% 5.660us 5.660us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.656ms -Self CUDA time total: 123.964us +Self CPU time total: 2.913ms +Self CUDA time total: 123.679us @@ -4714,27 +4496,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 925.174us 889.06% 925.174us 925.174us 1 - torch_eager 20.56% 293.458us 99.64% 1.423ms 1.423ms 0.000us 0.00% 105.438us 105.438us 1 - aten::mul 10.42% 148.708us 18.32% 261.500us 10.896us 55.264us 53.11% 55.264us 2.303us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.264us 53.11% 55.264us 2.303us 24 - aten::copy_ 7.08% 101.081us 43.33% 618.656us 34.370us 32.670us 31.39% 34.046us 1.891us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 23.74% 24.703us 2.059us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.128us 15.50% 16.128us 1.344us 12 - aten::clone 1.49% 21.220us 38.03% 542.913us 90.485us 0.000us 0.00% 9.343us 1.557us 6 - aten::sub 2.38% 33.992us 4.03% 57.481us 9.580us 8.064us 7.75% 8.064us 1.344us 6 - aten::add 2.21% 31.510us 3.80% 54.250us 9.042us 8.064us 7.75% 8.064us 1.344us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.967us 7.66% 7.967us 1.328us 6 - Activity Buffer Request 14.99% 214.036us 14.99% 214.036us 214.036us 1.376us 1.32% 1.376us 1.376us 1 - aten::empty_strided 2.13% 30.461us 2.13% 30.461us 5.077us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.05% 243.458us 17.05% 243.458us 40.576us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.68% 66.831us 5.99% 85.500us 3.562us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.31% 18.669us 1.31% 18.669us 0.778us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.35% 219.102us 15.35% 219.102us 4.565us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.36% 5.101us 0.36% 5.101us 5.101us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 926.451us 888.37% 926.451us 926.451us 1 + torch_eager 20.56% 277.090us 99.61% 1.342ms 1.342ms 0.000us 0.00% 105.599us 105.599us 1 + aten::mul 11.75% 158.363us 19.88% 267.883us 11.162us 55.423us 53.14% 55.423us 2.309us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 55.423us 53.14% 55.423us 2.309us 24 + aten::copy_ 7.94% 107.035us 40.62% 547.383us 30.410us 32.352us 31.02% 33.664us 1.870us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 24.640us 23.63% 24.640us 2.053us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.512us 15.83% 16.512us 1.376us 12 + aten::clone 1.47% 19.840us 34.29% 462.099us 77.016us 0.000us 0.00% 9.024us 1.504us 6 + aten::sub 2.93% 39.461us 4.68% 63.054us 10.509us 8.287us 7.95% 8.287us 1.381us 6 + aten::add 2.50% 33.680us 4.16% 56.100us 9.350us 8.225us 7.89% 8.225us 1.371us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 7.712us 7.39% 7.712us 1.285us 6 + Activity Buffer Request 14.74% 198.654us 14.74% 198.654us 198.654us 1.312us 1.26% 1.312us 1.312us 1 + aten::empty_strided 2.26% 30.481us 2.26% 30.481us 5.080us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.39% 180.523us 13.39% 180.523us 30.087us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.73% 63.708us 5.98% 80.630us 3.360us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 16.922us 1.26% 16.922us 0.705us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.08% 216.704us 16.08% 216.704us 4.515us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.39% 5.231us 0.39% 5.231us 5.231us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.428ms -Self CUDA time total: 104.062us +Self CPU time total: 1.348ms +Self CUDA time total: 104.287us @@ -4744,27 +4526,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 901.909us 727.35% 901.909us 901.909us 1 - torch_eager 19.87% 274.810us 99.60% 1.377ms 1.377ms 0.000us 0.00% 125.791us 125.791us 1 - aten::mul 10.85% 149.967us 18.79% 259.807us 10.825us 65.086us 52.49% 65.086us 2.712us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.086us 52.49% 65.086us 2.712us 24 - aten::copy_ 7.46% 103.216us 42.83% 592.168us 32.898us 39.518us 31.87% 41.310us 2.295us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.862us 23.28% 28.862us 2.405us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.395us 15.64% 19.395us 1.616us 12 - aten::clone 1.61% 22.200us 37.56% 519.385us 86.564us 0.000us 0.00% 12.448us 2.075us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.656us 8.59% 10.656us 1.776us 6 - aten::add 2.23% 30.899us 3.81% 52.660us 8.777us 9.730us 7.85% 9.730us 1.622us 6 - aten::sub 2.44% 33.801us 4.13% 57.151us 9.525us 9.665us 7.79% 9.665us 1.611us 6 - Activity Buffer Request 13.62% 188.345us 13.62% 188.345us 188.345us 1.792us 1.45% 1.792us 1.792us 1 - aten::empty_strided 2.34% 32.371us 2.34% 32.371us 5.395us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 17.39% 240.467us 17.39% 240.467us 40.078us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.87% 67.397us 6.22% 86.038us 3.585us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.35% 18.641us 1.35% 18.641us 0.777us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.56% 215.091us 15.56% 215.091us 4.481us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.40% 5.540us 0.40% 5.540us 5.540us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.662us 754.64% 931.662us 931.662us 1 + torch_eager 20.88% 278.302us 99.60% 1.328ms 1.328ms 0.000us 0.00% 125.281us 125.281us 1 + aten::mul 11.71% 156.112us 20.55% 273.936us 11.414us 65.153us 52.77% 65.153us 2.715us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 65.153us 52.77% 65.153us 2.715us 24 + aten::copy_ 7.95% 105.951us 39.52% 526.779us 29.265us 39.169us 31.73% 40.993us 2.277us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 28.737us 23.28% 28.737us 2.395us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 19.135us 15.50% 19.135us 1.595us 12 + aten::clone 1.44% 19.200us 33.27% 443.406us 73.901us 0.000us 0.00% 12.256us 2.043us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 10.432us 8.45% 10.432us 1.739us 6 + aten::sub 2.81% 37.440us 4.58% 61.110us 10.185us 9.632us 7.80% 9.632us 1.605us 6 + aten::add 2.52% 33.611us 4.17% 55.611us 9.268us 9.503us 7.70% 9.503us 1.584us 6 + Activity Buffer Request 13.21% 176.083us 13.21% 176.083us 176.083us 1.824us 1.48% 1.824us 1.824us 1 + aten::empty_strided 2.29% 30.570us 2.29% 30.570us 5.095us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.82% 184.192us 13.82% 184.192us 30.699us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.90% 65.274us 6.16% 82.123us 3.422us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 16.849us 1.26% 16.849us 0.702us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.81% 224.047us 16.81% 224.047us 4.668us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.310us 0.40% 5.310us 5.310us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.383ms -Self CUDA time total: 123.999us +Self CPU time total: 1.333ms +Self CUDA time total: 123.457us @@ -4774,27 +4556,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.661us 533.26% 944.661us 944.661us 1 - torch_eager 10.70% 284.298us 99.79% 2.652ms 2.652ms 0.000us 0.00% 180.029us 180.029us 1 - aten::mul 6.06% 161.074us 10.27% 272.980us 11.374us 94.781us 53.50% 94.781us 3.949us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.781us 53.50% 94.781us 3.949us 24 - aten::copy_ 3.97% 105.392us 69.06% 1.835ms 101.961us 57.664us 32.55% 60.545us 3.364us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.607us 22.92% 40.607us 3.384us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.703us 13.94% 24.703us 2.059us 12 - aten::clone 0.89% 23.759us 66.19% 1.759ms 293.179us 0.000us 0.00% 19.938us 3.323us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.057us 9.63% 17.057us 2.843us 6 - aten::sub 1.37% 36.511us 2.33% 61.971us 10.329us 12.383us 6.99% 12.383us 2.064us 6 - aten::add 1.17% 31.070us 2.01% 53.400us 8.900us 12.320us 6.95% 12.320us 2.053us 6 - Activity Buffer Request 53.91% 1.433ms 53.91% 1.433ms 1.433ms 2.881us 1.63% 2.881us 2.881us 1 - aten::empty_strided 1.17% 31.132us 1.17% 31.132us 5.189us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.85% 235.245us 8.85% 235.245us 39.208us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.64% 70.123us 3.36% 89.202us 3.717us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.72% 19.079us 0.72% 19.079us 0.795us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.35% 221.788us 8.35% 221.788us 4.621us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.21% 5.460us 0.21% 5.460us 5.460us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 944.092us 532.26% 944.092us 944.092us 1 + torch_eager 9.66% 282.874us 99.81% 2.921ms 2.921ms 0.000us 0.00% 180.253us 180.253us 1 + aten::mul 5.51% 161.402us 9.28% 271.603us 11.317us 95.040us 53.58% 95.040us 3.960us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.040us 53.58% 95.040us 3.960us 24 + aten::copy_ 3.62% 106.065us 72.07% 2.109ms 117.193us 57.663us 32.51% 60.543us 3.364us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.703us 22.95% 40.703us 3.392us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.670us 13.91% 24.670us 2.056us 12 + aten::clone 0.77% 22.428us 69.22% 2.026ms 337.680us 0.000us 0.00% 19.840us 3.307us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.960us 9.56% 16.960us 2.827us 6 + aten::add 1.16% 34.010us 1.95% 57.150us 9.525us 12.383us 6.98% 12.383us 2.064us 6 + aten::sub 1.32% 38.563us 2.15% 62.972us 10.495us 12.287us 6.93% 12.287us 2.048us 6 + Activity Buffer Request 59.97% 1.755ms 59.97% 1.755ms 1.755ms 2.880us 1.62% 2.880us 2.880us 1 + aten::empty_strided 1.05% 30.691us 1.05% 30.691us 5.115us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.31% 184.633us 6.31% 184.633us 30.772us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.32% 67.977us 2.88% 84.170us 3.507us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.55% 16.193us 0.55% 16.193us 0.675us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.56% 221.262us 7.56% 221.262us 4.610us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.19% 5.669us 0.19% 5.669us 5.669us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.657ms -Self CUDA time total: 177.148us +Self CPU time total: 2.927ms +Self CUDA time total: 177.373us @@ -4804,27 +4586,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 954.138us 321.69% 954.138us 954.138us 1 - torch_eager 11.45% 309.471us 99.80% 2.697ms 2.697ms 0.000us 0.00% 313.854us 313.854us 1 - aten::mul 5.62% 151.933us 9.84% 265.955us 11.081us 144.896us 48.85% 144.896us 6.037us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 144.896us 48.85% 144.896us 6.037us 24 - aten::copy_ 3.99% 107.722us 68.69% 1.856ms 103.120us 111.039us 37.44% 128.287us 7.127us 18 - aten::clone 1.05% 28.369us 65.82% 1.779ms 296.444us 0.000us 0.00% 70.944us 11.824us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.33% 57.343us 4.779us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.696us 18.10% 53.696us 8.949us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.671us 13.71% 40.671us 3.389us 12 - aten::sub 1.32% 35.620us 2.23% 60.211us 10.035us 20.448us 6.89% 20.448us 3.408us 6 - aten::add 1.16% 31.420us 1.99% 53.831us 8.972us 20.223us 6.82% 20.223us 3.371us 6 - Activity Buffer Request 53.66% 1.450ms 53.66% 1.450ms 1.450ms 17.248us 5.82% 17.248us 17.248us 1 - aten::empty_strided 1.25% 33.832us 1.25% 33.832us 5.639us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 8.57% 231.556us 8.57% 231.556us 38.593us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 2.58% 69.773us 3.29% 88.953us 3.706us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.71% 19.180us 0.71% 19.180us 0.799us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 8.44% 228.015us 8.44% 228.015us 4.750us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.20% 5.370us 0.20% 5.370us 5.370us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.029us 320.35% 956.029us 956.029us 1 + torch_eager 10.28% 306.488us 99.82% 2.977ms 2.977ms 0.000us 0.00% 316.194us 316.194us 1 + aten::mul 5.10% 152.001us 8.95% 266.845us 11.119us 146.560us 49.11% 146.560us 6.107us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.560us 49.11% 146.560us 6.107us 24 + aten::copy_ 3.72% 110.901us 71.64% 2.137ms 118.718us 110.754us 37.11% 128.514us 7.140us 18 + aten::clone 0.97% 28.901us 68.99% 2.058ms 342.957us 0.000us 0.00% 70.944us 11.824us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.570us 19.29% 57.570us 4.797us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.184us 17.82% 53.184us 8.864us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 41.120us 13.78% 41.120us 3.427us 12 + aten::add 1.16% 34.740us 1.93% 57.500us 9.583us 20.641us 6.92% 20.641us 3.440us 6 + aten::sub 1.34% 39.998us 2.18% 65.101us 10.850us 20.479us 6.86% 20.479us 3.413us 6 + Activity Buffer Request 59.58% 1.777ms 59.58% 1.777ms 1.777ms 17.760us 5.95% 17.760us 17.760us 1 + aten::empty_strided 1.05% 31.260us 1.05% 31.260us 5.210us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 6.26% 186.663us 6.26% 186.663us 31.111us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.24% 66.809us 2.82% 84.238us 3.510us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.58% 17.429us 0.58% 17.429us 0.726us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 7.54% 224.919us 7.54% 224.919us 4.686us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.18% 5.469us 0.18% 5.469us 5.469us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.702ms -Self CUDA time total: 296.606us +Self CPU time total: 2.983ms +Self CUDA time total: 298.434us @@ -4834,27 +4616,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 930.130us 525.53% 930.130us 930.130us 1 - torch_eager 19.64% 282.826us 99.65% 1.435ms 1.435ms 0.000us 0.00% 179.836us 179.836us 1 - aten::mul 10.48% 150.844us 18.43% 265.387us 11.058us 94.845us 53.59% 94.845us 3.952us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 94.845us 53.59% 94.845us 3.952us 24 - aten::copy_ 8.38% 120.684us 44.09% 634.887us 35.272us 57.502us 32.49% 60.350us 3.353us 18 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.478us 22.87% 40.478us 3.373us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.641us 13.92% 24.641us 2.053us 12 - aten::clone 1.49% 21.461us 38.48% 554.053us 92.342us 0.000us 0.00% 19.872us 3.312us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 17.024us 9.62% 17.024us 2.837us 6 - aten::sub 2.41% 34.731us 4.09% 58.881us 9.813us 12.353us 6.98% 12.353us 2.059us 6 - aten::add 2.13% 30.662us 3.72% 53.511us 8.919us 12.288us 6.94% 12.288us 2.048us 6 - Activity Buffer Request 15.30% 220.275us 15.30% 220.275us 220.275us 2.848us 1.61% 2.848us 2.848us 1 - aten::empty_strided 2.11% 30.450us 2.11% 30.450us 5.075us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 15.99% 230.296us 15.99% 230.296us 38.383us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.74% 68.240us 6.08% 87.483us 3.645us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.34% 19.243us 1.34% 19.243us 0.802us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.64% 225.174us 15.64% 225.174us 4.691us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 5.110us 0.35% 5.110us 5.110us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 916.392us 515.61% 916.392us 916.392us 1 + torch_eager 19.58% 274.201us 99.60% 1.394ms 1.394ms 0.000us 0.00% 180.610us 180.610us 1 + aten::mul 11.24% 157.371us 18.87% 264.183us 11.008us 95.074us 53.49% 95.074us 3.961us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 95.074us 53.49% 95.074us 3.961us 24 + aten::copy_ 7.77% 108.775us 43.49% 608.863us 33.826us 57.825us 32.54% 60.705us 3.373us 18 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 40.897us 23.01% 40.897us 3.408us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 24.831us 13.97% 24.831us 2.069us 12 + aten::clone 1.40% 19.580us 37.38% 523.368us 87.228us 0.000us 0.00% 19.808us 3.301us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 16.928us 9.52% 16.928us 2.821us 6 + aten::add 2.38% 33.360us 4.00% 56.040us 9.340us 12.416us 6.99% 12.416us 2.069us 6 + aten::sub 2.76% 38.582us 4.39% 61.472us 10.245us 12.415us 6.99% 12.415us 2.069us 6 + Activity Buffer Request 18.14% 253.955us 18.14% 253.955us 253.955us 2.880us 1.62% 2.880us 2.880us 1 + aten::empty_strided 2.13% 29.860us 2.13% 29.860us 4.977us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.38% 187.273us 13.38% 187.273us 31.212us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.53% 63.391us 5.73% 80.293us 3.346us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.21% 16.902us 1.21% 16.902us 0.704us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.09% 211.242us 15.09% 211.242us 4.401us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.40% 5.600us 0.40% 5.600us 5.600us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.440ms -Self CUDA time total: 176.988us +Self CPU time total: 1.400ms +Self CUDA time total: 177.730us @@ -4864,27 +4646,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.347us 313.60% 931.347us 931.347us 1 - torch_eager 20.13% 283.358us 99.65% 1.403ms 1.403ms 0.000us 0.00% 314.679us 314.679us 1 - aten::mul 10.72% 150.883us 18.79% 264.457us 11.019us 145.371us 48.95% 145.371us 6.057us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.371us 48.95% 145.371us 6.057us 24 - aten::copy_ 7.40% 104.164us 42.97% 604.868us 33.604us 110.845us 37.32% 128.541us 7.141us 18 - aten::clone 1.53% 21.600us 37.15% 522.944us 87.157us 0.000us 0.00% 71.357us 11.893us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.184us 19.25% 57.184us 4.765us 12 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.661us 18.07% 53.661us 8.944us 6 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.767us 13.73% 40.767us 3.397us 12 - aten::add 2.28% 32.151us 3.88% 54.682us 9.114us 20.446us 6.88% 20.446us 3.408us 6 - aten::sub 2.39% 33.622us 4.06% 57.171us 9.528us 20.321us 6.84% 20.321us 3.387us 6 - Activity Buffer Request 14.77% 207.975us 14.77% 207.975us 207.975us 17.696us 5.96% 17.696us 17.696us 1 - aten::empty_strided 2.15% 30.270us 2.15% 30.270us 5.045us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 16.22% 228.377us 16.22% 228.377us 38.063us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.75% 66.830us 6.13% 86.290us 3.595us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.38% 19.460us 1.38% 19.460us 0.811us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.91% 224.006us 15.91% 224.006us 4.667us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.35% 4.971us 0.35% 4.971us 4.971us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 934.618us 312.71% 934.618us 934.618us 1 + torch_eager 20.60% 280.895us 99.62% 1.358ms 1.358ms 0.000us 0.00% 316.921us 316.921us 1 + aten::mul 11.57% 157.759us 19.61% 267.373us 11.141us 146.460us 49.00% 146.460us 6.102us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 146.460us 49.00% 146.460us 6.102us 24 + aten::copy_ 8.07% 110.072us 41.19% 561.700us 31.206us 111.966us 37.46% 130.013us 7.223us 18 + aten::clone 1.51% 20.600us 34.77% 474.096us 79.016us 0.000us 0.00% 72.670us 12.112us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 57.343us 19.19% 57.343us 4.779us 12 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 54.623us 18.28% 54.623us 9.104us 6 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 40.448us 13.53% 40.448us 3.371us 12 + aten::add 2.59% 35.260us 4.22% 57.590us 9.598us 20.288us 6.79% 20.288us 3.381us 6 + aten::sub 2.60% 35.410us 4.30% 58.621us 9.770us 20.160us 6.75% 20.160us 3.360us 6 + Activity Buffer Request 14.73% 200.853us 14.73% 200.853us 200.853us 18.047us 6.04% 18.047us 18.047us 1 + aten::empty_strided 2.18% 29.660us 2.18% 29.660us 4.943us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.85% 188.823us 13.85% 188.823us 31.471us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.75% 64.754us 6.01% 81.922us 3.413us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.26% 17.168us 1.26% 17.168us 0.715us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 15.92% 217.107us 15.92% 217.107us 4.523us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.38% 5.180us 0.38% 5.180us 5.180us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.408ms -Self CUDA time total: 296.983us +Self CPU time total: 1.364ms +Self CUDA time total: 298.874us @@ -4894,27 +4676,27 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 931.511us 159.85% 931.511us 931.511us 1 - torch_eager 19.89% 283.237us 99.62% 1.419ms 1.419ms 0.000us 0.00% 606.457us 606.457us 1 - aten::copy_ 7.21% 102.593us 43.52% 619.697us 34.428us 267.708us 45.94% 291.419us 16.190us 18 - aten::mul 10.56% 150.425us 18.55% 264.165us 11.007us 249.406us 42.80% 249.406us 10.392us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 249.406us 42.80% 249.406us 10.392us 24 - aten::clone 1.52% 21.631us 38.04% 541.603us 90.267us 0.000us 0.00% 201.277us 33.546us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 177.566us 30.47% 177.566us 29.594us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 90.142us 15.47% 90.142us 7.512us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.632us 11.26% 65.632us 5.469us 12 - aten::add 2.16% 30.762us 3.77% 53.662us 8.944us 32.832us 5.63% 32.832us 5.472us 6 - aten::sub 2.53% 36.013us 4.23% 60.192us 10.032us 32.800us 5.63% 32.800us 5.467us 6 - Activity Buffer Request 14.90% 212.145us 14.90% 212.145us 212.145us 23.711us 4.07% 23.711us 23.711us 1 - aten::empty_strided 2.14% 30.440us 2.14% 30.440us 5.073us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 16.99% 241.846us 16.99% 241.846us 40.308us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 4.71% 67.093us 6.00% 85.482us 3.562us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 1.29% 18.389us 1.29% 18.389us 0.766us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 15.73% 223.932us 15.73% 223.932us 4.665us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 0.38% 5.360us 0.38% 5.360us 5.360us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 956.919us 161.50% 956.919us 956.919us 1 + torch_eager 21.30% 289.504us 99.57% 1.353ms 1.353ms 0.000us 0.00% 616.281us 616.281us 1 + aten::copy_ 7.84% 106.532us 38.89% 528.548us 29.364us 278.013us 46.92% 301.788us 16.766us 18 + aten::mul 11.95% 162.407us 20.79% 282.469us 11.770us 248.703us 41.97% 248.703us 10.363us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 248.703us 41.97% 248.703us 10.363us 24 + aten::clone 1.53% 20.799us 32.73% 444.735us 74.123us 0.000us 0.00% 210.204us 35.034us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 186.429us 31.46% 186.429us 31.072us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 91.584us 15.46% 91.584us 7.632us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 65.790us 11.10% 65.790us 5.483us 12 + aten::add 2.44% 33.161us 4.08% 55.501us 9.250us 32.927us 5.56% 32.927us 5.488us 6 + aten::sub 2.95% 40.030us 4.74% 64.440us 10.740us 32.863us 5.55% 32.863us 5.477us 6 + Activity Buffer Request 13.07% 177.663us 13.07% 177.663us 177.663us 23.775us 4.01% 23.775us 23.775us 1 + aten::empty_strided 2.15% 29.270us 2.15% 29.270us 4.878us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 13.63% 185.172us 13.63% 185.172us 30.862us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 4.83% 65.662us 6.08% 82.660us 3.444us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 1.25% 16.998us 1.25% 16.998us 0.708us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 16.63% 225.993us 16.63% 225.993us 4.708us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 0.43% 5.780us 0.43% 5.780us 5.780us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.424ms -Self CUDA time total: 582.746us +Self CPU time total: 1.359ms +Self CUDA time total: 592.506us @@ -4924,59 +4706,105 @@ PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 13.84% 306.170us 64.60% 1.429ms 1.429ms 0.000us 0.00% 1.835ms 1.835ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.808ms 102.17% 1.808ms 1.808ms 1 - aten::copy_ 5.17% 114.346us 26.90% 594.995us 33.055us 791.984us 44.77% 858.095us 47.672us 18 - aten::mul 6.78% 150.032us 12.17% 269.044us 11.210us 828.790us 46.85% 828.790us 34.533us 24 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 828.790us 46.85% 828.790us 34.533us 24 - aten::clone 1.04% 23.090us 22.74% 502.934us 83.822us 0.000us 0.00% 626.230us 104.372us 6 - Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 560.119us 31.66% 560.119us 93.353us 6 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 231.865us 13.11% 231.865us 19.322us 12 -void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 148.413us 8.39% 148.413us 12.368us 12 - aten::sub 1.69% 37.309us 2.75% 60.900us 10.150us 90.142us 5.10% 90.142us 15.024us 6 - Activity Buffer Request 8.38% 185.324us 8.38% 185.324us 185.324us 66.111us 3.74% 66.111us 66.111us 1 - aten::add 1.41% 31.181us 2.49% 55.022us 9.170us 58.271us 3.29% 58.271us 9.712us 6 - aten::empty_strided 1.45% 31.982us 1.45% 31.982us 5.330us 0.000us 0.00% 0.000us 0.000us 6 - cudaMemcpyAsync 10.29% 227.584us 10.29% 227.584us 37.931us 0.000us 0.00% 0.000us 0.000us 6 - aten::slice 3.11% 68.695us 3.96% 87.553us 3.648us 0.000us 0.00% 0.000us 0.000us 24 - aten::as_strided 0.85% 18.858us 0.85% 18.858us 0.786us 0.000us 0.00% 0.000us 0.000us 24 - cudaLaunchKernel 10.59% 234.185us 10.59% 234.185us 4.879us 0.000us 0.00% 0.000us 0.000us 48 - cudaDeviceSynchronize 35.40% 782.770us 35.40% 782.770us 782.770us 0.000us 0.00% 0.000us 0.000us 1 + torch_eager 12.69% 276.287us 61.52% 1.340ms 1.340ms 0.000us 0.00% 1.863ms 1.863ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.835ms 102.22% 1.835ms 1.835ms 1 + aten::copy_ 5.01% 109.060us 24.98% 544.137us 30.230us 806.007us 44.89% 873.590us 48.533us 18 + aten::mul 7.11% 154.844us 12.06% 262.604us 10.942us 842.615us 46.93% 842.615us 35.109us 24 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 842.615us 46.93% 842.615us 35.109us 24 + aten::clone 1.01% 22.000us 21.12% 459.916us 76.653us 0.000us 0.00% 622.361us 103.727us 6 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 554.778us 30.90% 554.778us 92.463us 6 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 251.229us 13.99% 251.229us 20.936us 12 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 146.939us 8.18% 146.939us 12.245us 12 + aten::sub 1.90% 41.421us 3.00% 65.411us 10.902us 88.573us 4.93% 88.573us 14.762us 6 + Activity Buffer Request 8.49% 184.983us 8.49% 184.983us 184.983us 67.583us 3.76% 67.583us 67.583us 1 + aten::add 1.54% 33.561us 2.59% 56.461us 9.410us 58.366us 3.25% 58.366us 9.728us 6 + aten::empty_strided 1.42% 30.960us 1.42% 30.960us 5.160us 0.000us 0.00% 0.000us 0.000us 6 + cudaMemcpyAsync 8.70% 189.543us 8.70% 189.543us 31.591us 0.000us 0.00% 0.000us 0.000us 6 + aten::slice 2.99% 65.113us 3.77% 82.061us 3.419us 0.000us 0.00% 0.000us 0.000us 24 + aten::as_strided 0.78% 16.948us 0.78% 16.948us 0.706us 0.000us 0.00% 0.000us 0.000us 24 + cudaLaunchKernel 9.88% 215.201us 9.88% 215.201us 4.483us 0.000us 0.00% 0.000us 0.000us 48 + cudaDeviceSynchronize 38.48% 838.063us 38.48% 838.063us 838.063us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.212ms -Self CUDA time total: 1.769ms +Self CPU time total: 2.178ms +Self CUDA time total: 1.796ms impl wl p50(ms) ok -torch_eager cuda_B1_S128_H32_D128_R64 0.21 True -torch_eager cuda_B1_S128_H32_D64_R32 0.22 True -torch_eager cuda_B1_S128_H8_D128_R64 0.22 True -torch_eager cuda_B1_S128_H8_D64_R32 0.17 True -torch_eager cuda_B1_S2048_H32_D128_R64 0.22 True -torch_eager cuda_B1_S2048_H32_D64_R32 0.21 True -torch_eager cuda_B1_S2048_H8_D128_R64 0.21 True +torch_eager cuda_B1_S128_H32_D128_R64 0.22 True +torch_eager cuda_B1_S128_H32_D64_R32 0.23 True +torch_eager cuda_B1_S128_H8_D128_R64 0.23 True +torch_eager cuda_B1_S128_H8_D64_R32 0.18 True +torch_eager cuda_B1_S2048_H32_D128_R64 0.23 True +torch_eager cuda_B1_S2048_H32_D64_R32 0.22 True +torch_eager cuda_B1_S2048_H8_D128_R64 0.22 True torch_eager cuda_B1_S2048_H8_D64_R32 0.22 True torch_eager cuda_B1_S512_H32_D128_R64 0.22 True -torch_eager cuda_B1_S512_H32_D64_R32 0.21 True -torch_eager cuda_B1_S512_H8_D128_R64 0.21 True -torch_eager cuda_B1_S512_H8_D64_R32 0.21 True -torch_eager cuda_B2_S128_H32_D128_R64 0.21 True +torch_eager cuda_B1_S512_H32_D64_R32 0.22 True +torch_eager cuda_B1_S512_H8_D128_R64 0.23 True +torch_eager cuda_B1_S512_H8_D64_R32 0.23 True +torch_eager cuda_B2_S128_H32_D128_R64 0.22 True torch_eager cuda_B2_S128_H32_D64_R32 0.22 True -torch_eager cuda_B2_S128_H8_D128_R64 0.21 True +torch_eager cuda_B2_S128_H8_D128_R64 0.22 True torch_eager cuda_B2_S128_H8_D64_R32 0.22 True -torch_eager cuda_B2_S2048_H32_D128_R64 0.64 True +torch_eager cuda_B2_S2048_H32_D128_R64 0.65 True torch_eager cuda_B2_S2048_H32_D64_R32 0.23 True torch_eager cuda_B2_S2048_H8_D128_R64 0.22 True torch_eager cuda_B2_S2048_H8_D64_R32 0.22 True torch_eager cuda_B2_S512_H32_D128_R64 0.22 True -torch_eager cuda_B2_S512_H32_D64_R32 0.22 True -torch_eager cuda_B2_S512_H8_D128_R64 0.21 True -torch_eager cuda_B2_S512_H8_D64_R32 0.21 True +torch_eager cuda_B2_S512_H32_D64_R32 0.23 True +torch_eager cuda_B2_S512_H8_D128_R64 0.22 True +torch_eager cuda_B2_S512_H8_D64_R32 0.23 True▶ UV Install Logs