drbh committed on
Commit
f9791fd
·
1 Parent(s): c46b1d4

feat: push full template and build to repo

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ build
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # first-kernel
2
+
3
+ A custom kernel for PyTorch.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install drbh/first-kernel
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ import torch
15
+ from first_kernel import first_kernel
16
+
17
+ # Create input tensor
18
+ x = torch.randn(1024, 1024, device="cuda")
19
+
20
+ # Run kernel
21
+ result = first_kernel(x)
22
+ ```
23
+
24
+ ## Development
25
+
26
+ ### Building
27
+
28
+ ```bash
29
+ nix develop
30
+ nix run .#build-and-copy
31
+ ```
32
+
33
+ ### Testing
34
+
35
+ ```bash
36
+ nix develop .#test
37
+ pytest tests/
38
+ ```
39
+
40
+ ### Test as a `kernels` user
41
+
42
+ ```bash
43
+ uv run example.py
44
+ ```
45
+
46
+ ## License
47
+
48
+ Apache 2.0
build.toml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
[general]
# Only the Metal (macOS GPU) backend is built for this kernel.
backends = ["metal"]
name = "first-kernel"
version = 1

# Metal implementation: Objective-C++ host code plus the shader source that
# is compiled into an embedded metallib.
[kernel.first_kernel_metal]
backend = "metal"
depends = ["torch"]
src = [
    "first_kernel_metal/first_kernel.mm",
    "first_kernel_metal/first_kernel.metal",
]

# Torch extension glue: registers the op with the PyTorch dispatcher.
[torch]
src = [
    "torch-ext/torch_binding.cpp",
    "torch-ext/torch_binding.h",
]
example.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "kernels",
# "numpy",
# "torch",
# ]
# ///

"""Smoke-test the locally built first_kernel: it must add 1.0 to each element."""

import platform
from pathlib import Path

import kernels
import torch

# Load the locally built kernel from the ./build directory.
kernel = kernels.get_local_kernel(Path("build"), "first_kernel")

# Select device. On macOS, confirm the MPS backend is actually usable before
# selecting it (mirrors the availability checks on the xpu/cuda branches);
# otherwise fall through to the next option.
if platform.system() == "Darwin" and torch.backends.mps.is_available():
    device = torch.device("mps")
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.version.cuda is not None and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

# Create input tensor
x = torch.tensor([1.0, 2.0, 3.0], device=device)
print(f"Input: {x}")

# Run kernel (adds 1 to each element)
result = kernel.first_kernel(x)
print(f"Output: {result}")

# Verify result
expected = x + 1.0
assert torch.allclose(result, expected), "Kernel output doesn't match expected!"
print("Success!")
first_kernel_metal/first_kernel.metal ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
#include <metal_stdlib>
using namespace metal;

// Element-wise "add one": output[i] = input[i] + 1.0f.
// One thread handles exactly one element; the host side dispatches exactly
// one thread per tensor element via dispatchThreads, so no bounds check is
// performed here.
kernel void first_kernel_kernel(device const float *input [[buffer(0)]],
                                device float *output [[buffer(1)]],
                                uint tid [[thread_position_in_grid]]) {
  output[tid] = input[tid] + 1.0f;
}
first_kernel_metal/first_kernel.mm ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#include <torch/torch.h>

#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

// The build system compiles first_kernel.metal into a metallib, embeds its
// bytes in a generated header, and points EMBEDDED_METALLIB_HEADER at it;
// EMBEDDED_METALLIB_NAMESPACE::createLibrary reconstitutes an MTLLibrary
// from those bytes at runtime.
#ifdef EMBEDDED_METALLIB_HEADER
#include EMBEDDED_METALLIB_HEADER
#else
#error "EMBEDDED_METALLIB_HEADER not defined"
#endif

// Reinterpret the MPS tensor's raw storage pointer as the id<MTLBuffer>
// that backs it. NOTE(review): relies on PyTorch's MPS allocator storing an
// MTLBuffer as the storage data pointer — confirm against the torch version
// in use.
static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor &tensor) {
  return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
}

/// Computes out[i] = input[i] + 1.0f element-wise on the MPS device.
///
/// Requires contiguous float32 MPS tensors of identical shape/dtype/device.
/// Work is encoded on PyTorch's MPS command stream so it is ordered with
/// respect to other MPS operations.
void first_kernel(torch::Tensor &out, torch::Tensor const &input) {
  TORCH_CHECK(input.device().is_mps(), "input must be a MPS tensor");
  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
  TORCH_CHECK(input.scalar_type() == at::ScalarType::Float,
              "first_kernel only supports float32");
  TORCH_CHECK(input.sizes() == out.sizes(), "Tensors must have same shape");
  TORCH_CHECK(input.scalar_type() == out.scalar_type(),
              "Tensors must have same dtype");
  TORCH_CHECK(input.device() == out.device(),
              "Tensors must be on same device");

  // Empty tensors: nothing to do, and dispatching a zero-sized grid (with a
  // zero-sized threadgroup) is invalid in Metal. int64_t avoids overflowing
  // a 32-bit int for very large tensors.
  const int64_t numThreads = input.numel();
  if (numThreads == 0) {
    return;
  }

  @autoreleasepool {
    id<MTLCommandBuffer> cmdBuf = torch::mps::get_command_buffer();
    TORCH_CHECK(cmdBuf, "Failed to get MPS command buffer");

    // Build the pipeline on the device that backs PyTorch's MPS command
    // buffer rather than MTLCreateSystemDefaultDevice(): the pipeline state
    // must be created on the same device the command buffer belongs to.
    id<MTLDevice> device = cmdBuf.device;

    NSError *error = nil;
    id<MTLLibrary> library =
        EMBEDDED_METALLIB_NAMESPACE::createLibrary(device, &error);
    TORCH_CHECK(library, "Failed to create Metal library: ",
                error.localizedDescription.UTF8String);

    id<MTLFunction> func =
        [library newFunctionWithName:@"first_kernel_kernel"];
    TORCH_CHECK(func, "Failed to create function");

    id<MTLComputePipelineState> pso =
        [device newComputePipelineStateWithFunction:func error:&error];
    TORCH_CHECK(pso, error.localizedDescription.UTF8String);

    // Encode on PyTorch's MPS dispatch queue so we do not race other work
    // encoding into the same command buffer.
    dispatch_sync(torch::mps::get_dispatch_queue(), ^() {
      id<MTLComputeCommandEncoder> encoder = [cmdBuf computeCommandEncoder];
      [encoder setComputePipelineState:pso];
      // Byte offsets account for tensors that view into a larger storage.
      [encoder setBuffer:getMTLBufferStorage(input)
                  offset:input.storage_offset() * input.element_size()
                 atIndex:0];
      [encoder setBuffer:getMTLBufferStorage(out)
                  offset:out.storage_offset() * out.element_size()
                 atIndex:1];

      // One thread per element; dispatchThreads handles the non-uniform
      // final threadgroup, so no padding or shader-side bounds check needed.
      NSUInteger tgSize =
          MIN(pso.maxTotalThreadsPerThreadgroup, (NSUInteger)numThreads);
      [encoder dispatchThreads:MTLSizeMake((NSUInteger)numThreads, 1, 1)
         threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)];
      [encoder endEncoding];
      torch::mps::commit();
    });
  }
}
flake.lock ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1765121682,
6
+ "narHash": "sha256-4VBOP18BFeiPkyhy9o4ssBNQEvfvv1kXkasAYd0+rrA=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "65f23138d8d09a92e30f1e5c87611b23ef451bf3",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-utils": {
19
+ "inputs": {
20
+ "systems": "systems"
21
+ },
22
+ "locked": {
23
+ "lastModified": 1731533236,
24
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
25
+ "owner": "numtide",
26
+ "repo": "flake-utils",
27
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
28
+ "type": "github"
29
+ },
30
+ "original": {
31
+ "owner": "numtide",
32
+ "repo": "flake-utils",
33
+ "type": "github"
34
+ }
35
+ },
36
+ "kernel-builder": {
37
+ "inputs": {
38
+ "flake-compat": "flake-compat",
39
+ "flake-utils": "flake-utils",
40
+ "nixpkgs": "nixpkgs",
41
+ "rust-overlay": "rust-overlay"
42
+ },
43
+ "locked": {
44
+ "lastModified": 1774018498,
45
+ "narHash": "sha256-enigJmSw6g6e7PjsQ9z8aaMJJaSUVEOpOHsKulWhaSs=",
46
+ "owner": "huggingface",
47
+ "repo": "kernels",
48
+ "rev": "efe2480951107f1880a59cf1b5ae364b5d861566",
49
+ "type": "github"
50
+ },
51
+ "original": {
52
+ "owner": "huggingface",
53
+ "repo": "kernels",
54
+ "type": "github"
55
+ }
56
+ },
57
+ "nixpkgs": {
58
+ "locked": {
59
+ "lastModified": 1766341660,
60
+ "narHash": "sha256-4yG6vx7Dddk9/zh45Y2KM82OaRD4jO3HA9r98ORzysA=",
61
+ "owner": "NixOS",
62
+ "repo": "nixpkgs",
63
+ "rev": "26861f5606e3e4d1400771b513cc63e5f70151a6",
64
+ "type": "github"
65
+ },
66
+ "original": {
67
+ "owner": "NixOS",
68
+ "ref": "nixos-unstable-small",
69
+ "repo": "nixpkgs",
70
+ "type": "github"
71
+ }
72
+ },
73
+ "root": {
74
+ "inputs": {
75
+ "kernel-builder": "kernel-builder"
76
+ }
77
+ },
78
+ "rust-overlay": {
79
+ "inputs": {
80
+ "nixpkgs": [
81
+ "kernel-builder",
82
+ "nixpkgs"
83
+ ]
84
+ },
85
+ "locked": {
86
+ "lastModified": 1769050281,
87
+ "narHash": "sha256-1H8DN4UZgEUqPUA5ecHOufLZMscJ4IlcGaEftaPtpBY=",
88
+ "owner": "oxalica",
89
+ "repo": "rust-overlay",
90
+ "rev": "6deef0585c52d9e70f96b6121207e1496d4b0c49",
91
+ "type": "github"
92
+ },
93
+ "original": {
94
+ "owner": "oxalica",
95
+ "repo": "rust-overlay",
96
+ "type": "github"
97
+ }
98
+ },
99
+ "systems": {
100
+ "locked": {
101
+ "lastModified": 1681028828,
102
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
103
+ "owner": "nix-systems",
104
+ "repo": "default",
105
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
106
+ "type": "github"
107
+ },
108
+ "original": {
109
+ "owner": "nix-systems",
110
+ "repo": "default",
111
+ "type": "github"
112
+ }
113
+ }
114
+ },
115
+ "root": "root",
116
+ "version": 7
117
+ }
flake.nix ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
{
  # kernel-builder supplies the Nix machinery for building Hugging Face
  # "kernels" projects (dev shells, build targets, etc.).
  inputs = {
    kernel-builder.url = "github:huggingface/kernels";
  };
  outputs =
    { self, kernel-builder, ... }:
    # Generate the standard kernel flake outputs for the project rooted at
    # this directory (reads build.toml for the kernel definition).
    kernel-builder.lib.genKernelFlakeOutputs {
      inherit self;
      path = ./.;
    };
}
tests/__init__.py ADDED
File without changes
tests/test_first_kernel.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import platform

import torch

import first_kernel


def _pick_device() -> torch.device:
    """Return the preferred accelerator for this host, falling back to CPU."""
    if platform.system() == "Darwin":
        return torch.device("mps")
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.device("xpu")
    if torch.version.cuda is not None and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def test_first_kernel():
    """first_kernel must add exactly 1.0 to every element of a float32 tensor."""
    device = _pick_device()
    x = torch.randn(1024, 1024, dtype=torch.float32, device=device)
    expected = x + 1.0
    result = first_kernel.first_kernel(x)
    torch.testing.assert_close(result, expected)
torch-ext/first_kernel/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Optional

import torch

from ._ops import ops


def first_kernel(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Run the first_kernel op, writing the result into ``out``.

    Args:
        x: Input tensor.
        out: Optional pre-allocated output tensor; when omitted, a new tensor
            matching ``x`` (same shape, dtype, device) is allocated.

    Returns:
        The output tensor — ``out`` if supplied, otherwise the new allocation.
    """
    result = out if out is not None else torch.empty_like(x)
    ops.first_kernel(result, x)
    return result
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#include <torch/library.h>

#include "registration.h"
#include "torch_binding.h"

// Register this extension's ops with the PyTorch dispatcher under the
// extension's namespace (TORCH_EXTENSION_NAME is injected by the build).
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Schema: mutates `out` in place (Tensor!), reads `input`, returns nothing.
  ops.def("first_kernel(Tensor! out, Tensor input) -> ()");
#if defined(METAL_KERNEL)
  // Only the Metal build ships an implementation, dispatched for MPS tensors.
  ops.impl("first_kernel", torch::kMPS, first_kernel);
#endif
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
#pragma once

#include <torch/torch.h>

// Element-wise add-one kernel: writes input + 1.0f into `out`.
// Both tensors must be contiguous float32 MPS tensors with identical shape,
// dtype, and device (checked by the implementation).
void first_kernel(torch::Tensor &out, torch::Tensor const &input);