drbh committed
Commit 79aac9d · 0 Parent(s)

feat: impl residual rms kernel repo

.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ py_example
README.md ADDED
@@ -0,0 +1,31 @@
+ ---
+ license: apache-2.0
+ ---
+
+ ## Residual RMS for ROCm
+
+ Residual RMS kernels from [residual_rms](https://github.com/huggingface/hf-rocm-kernels).
+
+ ## Development
+
+ This kernel can be built with the [HF Kernel Builder](https://github.com/huggingface/kernel-builder) using the following commands.
+
+ ### Build
+
+ ```bash
+ nix build .#bundle -L
+ ```
+
+ ### Dev shell
+
+ ```bash
+ nix develop -L
+ pytest tests
+ ```
+
+ ### Publish
+
+ ```bash
+ git remote add origin [email protected]:kernels-community/residual_rms
+ git push -u origin main
+ ```
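
For orientation, the fused operation these kernels implement is a residual add followed by RMS normalization, a weight multiply, scaling, and FP8 conversion. Below is a minimal PyTorch sketch of that computation; it is illustrative only (the function name, the torch fp8 dtype, and the exact rounding/clamping behaviour are assumptions, not this repo's API):

```python
import torch


def residual_rms_reference(
    input: torch.Tensor,     # [m, n], fp16
    residual: torch.Tensor,  # [m, n], fp16 (the kernel also writes input + residual back into this tensor)
    weight: torch.Tensor,    # [n], fp16
    epsilon: float,
    scale: float,
) -> torch.Tensor:
    # Residual connection, accumulated in fp32 like the kernel's variance accumulator.
    hidden = input.float() + residual.float()
    # RMS normalization over the last dimension.
    variance = hidden.pow(2).mean(dim=-1, keepdim=True)
    normed = hidden * torch.rsqrt(variance + epsilon)
    # Weight multiply in fp16, then scale and quantize to FP8.
    # The fp8 dtype name is an assumption; the kernel clamps and converts to E4M3 FNUZ.
    out = (normed.half() * weight).float() * scale
    return out.to(torch.float8_e4m3fnuz)
```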
build.toml ADDED
@@ -0,0 +1,21 @@
+ [general]
+ version = "0.0.1"
+
+ [torch]
+ name = "residual_rms"
+ src = [
+   "ext-torch/registration.h",
+   "ext-torch/torch_binding.cpp",
+   "ext-torch/torch_binding.h",
+ ]
+ include = ["."]
+ pyroot = "ext-torch"
+ pyext = ["py", "json"]
+
+ [kernel.residual_rms]
+ capabilities = ["7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"]
+ src = [
+   "residual_rms/residual_rms_dispatch.cu",
+   "residual_rms/compat.h",
+ ]
+ depends = ["torch"]
ext-torch/registration.h ADDED
@@ -0,0 +1,27 @@
+ #pragma once
+
+ #include <Python.h>
+
+ #define _CONCAT(A, B) A##B
+ #define CONCAT(A, B) _CONCAT(A, B)
+
+ #define _STRINGIFY(A) #A
+ #define STRINGIFY(A) _STRINGIFY(A)
+
+ // A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
+ // could be a macro instead of a literal token.
+ #define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+ // A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
+ // could be a macro instead of a literal token.
+ #define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
+   TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
+
+ // REGISTER_EXTENSION allows the shared library to be loaded and initialized
+ // via python's import statement.
+ #define REGISTER_EXTENSION(NAME)                                        \
+   PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                              \
+     static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,          \
+                                         STRINGIFY(NAME), nullptr, 0, nullptr}; \
+     return PyModule_Create(&module);                                    \
+   }
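
As the comment above notes, `REGISTER_EXTENSION` lets the compiled shared library be imported directly from Python, after which the ops registered via `TORCH_LIBRARY` are reachable under `torch.ops`. A hedged sketch of that flow (the module name `_residual_rms` is taken from this repo's fallback import path and may differ from the built artifact):

```python
import torch

# Importing the built extension runs REGISTER_EXTENSION and the TORCH_LIBRARY
# registration in torch_binding.cpp.
import _residual_rms  # noqa: F401

ops = torch.ops._residual_rms
print(ops.residual_rms)  # exposes the schema defined in torch_binding.cpp
```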
ext-torch/residual_rms/__init__.py ADDED
@@ -0,0 +1,27 @@
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _residual_rms
+
+         ops = torch.ops._residual_rms
+     except ImportError:
+         raise e
+
+
+ def residual_rms(
+     input: torch.Tensor,
+     residual: torch.Tensor,
+     weight: torch.Tensor,
+     output: torch.Tensor,
+     epsilon: float,
+     scale: float,
+     mode: int,
+     num_threads: int,
+ ) -> torch.Tensor:
+     # Thin wrapper matching the op schema registered in torch_binding.cpp.
+     ops.residual_rms(input, residual, weight, output, epsilon, scale, mode, num_threads)
+     return output
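
A hedged usage sketch for the wrapper above, with shapes and dtypes taken from the comments in `residual_rms_dispatch.cu`; the fp8 output dtype, the epsilon/scale values, and the thread count are assumptions:

```python
import torch

from residual_rms import residual_rms

m, n = 8, 4096
input = torch.randn(m, n, dtype=torch.float16, device="cuda")
residual = torch.randn(m, n, dtype=torch.float16, device="cuda")
weight = torch.randn(n, dtype=torch.float16, device="cuda")
# FP8 output buffer; the exact torch fp8 dtype is an assumption (the kernel emits E4M3 FNUZ bytes).
output = torch.empty(m, n, dtype=torch.float8_e4m3fnuz, device="cuda")

# `mode` selects the kernel version (0-4) and `num_threads` the block size.
residual_rms(input, residual, weight, output, epsilon=1e-6, scale=1.0, mode=4, num_threads=256)
```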
ext-torch/torch_binding.cpp ADDED
@@ -0,0 +1,12 @@
+ #include <torch/library.h>
+
+ #include "registration.h"
+ #include "torch_binding.h"
+
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+   // Compute the residual root mean square.
+   ops.def("residual_rms(Tensor input, Tensor residual, Tensor weight, Tensor output, float epsilon, float scale, int mode, int num_threads) -> ()");
+   ops.impl("residual_rms", torch::kCUDA, &residual_rms);
+ }
+
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
ext-torch/torch_binding.h ADDED
@@ -0,0 +1,10 @@
+ #pragma once
+
+ #include <optional>
+ #include <torch/library.h>
+
+ #include <vector>
+
+ void residual_rms(torch::Tensor &input, torch::Tensor &residual,
+                   torch::Tensor &weight, torch::Tensor &output, double epsilon,
+                   double scale, int64_t mode, int64_t num_threads);
flake.lock ADDED
@@ -0,0 +1,95 @@
+ {
+   "nodes": {
+     "flake-compat": {
+       "locked": {
+         "lastModified": 1733328505,
+         "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-utils": {
+       "inputs": {
+         "systems": "systems"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "kernel-builder": {
+       "inputs": {
+         "flake-compat": "flake-compat",
+         "flake-utils": "flake-utils",
+         "nixpkgs": "nixpkgs"
+       },
+       "locked": {
+         "lastModified": 1738315861,
+         "narHash": "sha256-QPWRaIPAMmQANuAOaZIKzh1e69OG8zBWGg+swESEajw=",
+         "ref": "refs/heads/main",
+         "rev": "eabeadcedba5dcef2a562b8f1ed5ec1feb485496",
+         "revCount": 72,
+         "type": "git",
+         "url": "ssh://[email protected]/huggingface/kernel-builder"
+       },
+       "original": {
+         "type": "git",
+         "url": "ssh://[email protected]/huggingface/kernel-builder"
+       }
+     },
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1738247409,
+         "narHash": "sha256-F72dKl9Na6/2N+garOm9qCXPa92GzR8eYSuDra6kbjY=",
+         "owner": "danieldk",
+         "repo": "nixpkgs",
+         "rev": "358f57074b70e3ee9e1dc118151a4f6f81fcd3bb",
+         "type": "github"
+       },
+       "original": {
+         "owner": "danieldk",
+         "ref": "cuda-12.6-for-kernel-builder",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "kernel-builder": "kernel-builder"
+       }
+     },
+     "systems": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,14 @@
+ {
+   description = "Flake for rocm residual rms kernels";
+
+   inputs = {
+     kernel-builder.url = "git+ssh://[email protected]/huggingface/kernel-builder";
+   };
+
+   outputs =
+     {
+       self,
+       kernel-builder,
+     }:
+     kernel-builder.lib.genFlakeOutputs ./.;
+ }
residual_rms/compat.h ADDED
@@ -0,0 +1,5 @@
+ #pragma once
+
+ #include <hip/hip_runtime.h>
+
+ #define WARP_SIZE 32
residual_rms/residual_rms_dispatch.cu ADDED
@@ -0,0 +1,56 @@
+ #include <ATen/cuda/CUDAContext.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <hip/hip_runtime.h>
+
+ #include "residual_rms/residual_rms_v0.cu"
+ #include "residual_rms/residual_rms_v1.cu"
+ #include "residual_rms/residual_rms_v2.cu"
+ #include "residual_rms/residual_rms_v3.cu"
+ #include "residual_rms/residual_rms_v4.cu"
+
+ void residual_rms(torch::Tensor& input,     // Shape: [m, n] / Layout: row-major / Dtype: fp16
+                   torch::Tensor& residual,  // Shape: [m, n] / Layout: row-major / Dtype: fp16
+                   torch::Tensor& weight,    // Shape: [n]    / Layout: row-major / Dtype: fp16
+                   torch::Tensor& output,    // Shape: [m, n] / Layout: row-major / Dtype: fp8
+                   double epsilon, double scale, int64_t mode,
+                   int64_t num_threads) {  // TODO: add fp16 output mode
+
+   // Retrieve shapes
+   const int rows = input.size(0);
+   const int cols = input.size(1);
+   // Activate device guard
+   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+
+   // Prepare kernel launch arguments
+   dim3 grid(rows);
+   dim3 block(num_threads);
+   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   // Launch kernel
+   switch (mode) {
+     case 1:
+       LAUNCH_RESIDUAL_RMS_V1;
+       break;
+     case 2:
+       LAUNCH_RESIDUAL_RMS_V2;
+       break;
+     case 3:
+       LAUNCH_RESIDUAL_RMS_V3;
+       break;
+     case 4:
+       LAUNCH_RESIDUAL_RMS_V4;
+       break;
+     default:
+       LAUNCH_RESIDUAL_RMS_V0;
+       break;
+   }
+ }
+
+ /*
+ Versions:
+ 0. non-vectorized version
+ 1. vectorizes loads and stores
+ 2. simplified indexing
+ 3. added packed conversion
+ 4. using packed types everywhere and custom ASM for residual connection and variance
+ */
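
The `mode` argument of the dispatcher selects one of the kernel versions listed above. A rough timing sketch for comparing them (illustrative only; the shapes, fp8 dtype name, warmup and repeat counts are arbitrary assumptions):

```python
import torch

from residual_rms import residual_rms

m, n = 256, 8192
input = torch.randn(m, n, dtype=torch.float16, device="cuda")
residual = torch.randn(m, n, dtype=torch.float16, device="cuda")
weight = torch.randn(n, dtype=torch.float16, device="cuda")
output = torch.empty(m, n, dtype=torch.float8_e4m3fnuz, device="cuda")

for mode in range(5):
    # Warmup, then time with CUDA events (torch exposes the same event API on ROCm/HIP).
    for _ in range(10):
        residual_rms(input, residual, weight, output, 1e-6, 1.0, mode, 256)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(100):
        residual_rms(input, residual, weight, output, 1e-6, 1.0, mode, 256)
    end.record()
    torch.cuda.synchronize()
    print(f"mode {mode}: {start.elapsed_time(end) / 100:.3f} ms")
```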
residual_rms/residual_rms_v0.cu ADDED
@@ -0,0 +1,57 @@
+ #include <torch/all.h>
+
+ #include <hip/hip_bf16.h>
+ #include <hip/hip_fp16.h>
+ #include <hipcub/util_type.hpp>
+ #include <hipcub/hipcub.hpp>
+ #include <hip/hip_fp8.h>
+
+ #include "utils/macros.h"
+
+ __global__ void _residual_rms_v0(const half* __restrict__ input, half* __restrict__ residual,
+                                  const half* __restrict__ weight, __hip_fp8_storage_t* __restrict__ output,
+                                  const float epsilon, const float scale, const int cols) {
+   // Advance pointers according to the position of the thread in the grid
+   input += blockIdx.x * cols;
+   residual += blockIdx.x * cols;
+   output += blockIdx.x * cols;
+
+   // Residual connection: inplace add of input to residual, accumulate norm along the way
+   float variance = 0.0f;
+
+   for (int i = threadIdx.x; i < cols; i += blockDim.x) {
+     half z = input[i];
+     z += residual[i];
+     float x = (float)z;
+     variance += (x * x);
+     residual[i] = z;
+   }
+   variance /= cols;
+
+   // Block reduce to compute the total norm
+   __shared__ float shared_normalizer;
+   using BlockReduce = hipcub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
+   if (threadIdx.x == 0) {
+     shared_normalizer = rsqrtf(variance + epsilon);
+   }
+   __syncthreads();
+
+   // Normalize and convert
+   for (int idx = threadIdx.x; idx < cols; idx += blockDim.x) {
+     float x = (float)residual[idx];
+     half y = (half)(x * shared_normalizer);
+     y = (y * weight[idx]);
+     x = (float)y;
+     x *= scale;
+     FP8_CLAMP(x, float);
+     output[idx] = __hip_cvt_float_to_fp8(x, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
+   }
+ }
+
+ #define LAUNCH_RESIDUAL_RMS_V0 \
+   (_residual_rms_v0<<<grid, block, 0, stream>>>((half*)input.data_ptr(), (half*)residual.data_ptr(), \
+                                                 (half*)weight.data_ptr(), (__hip_fp8_storage_t*)output.data_ptr(), \
+                                                 epsilon, scale, cols))
residual_rms/residual_rms_v1.cu ADDED
@@ -0,0 +1,103 @@
+ #include <torch/all.h>
+
+ #include <hip/hip_bf16.h>
+ #include <hip/hip_fp16.h>
+ #include <hipcub/util_type.hpp>
+ #include <hipcub/hipcub.hpp>
+ #include <hip/hip_fp8.h>
+
+ #include "utils/macros.h"
+
+ #define WPT 8 // WorkPerThreads
+
+ __global__ void _residual_rms_v1(const half* __restrict__ input, half* __restrict__ residual,
+                                  const half* __restrict__ weight, __hip_fp8_storage_t* __restrict__ output,
+                                  const float epsilon, const float scale, const int cols) {
+   // Advance pointers according to the position of the thread in the grid
+   input += blockIdx.x * cols;
+   residual += blockIdx.x * cols;
+   output += blockIdx.x * cols;
+
+   // Residual connection: inplace add of input to residual, accumulate norm along the way
+   float variance = 0.0f;
+   float fp32_residual;
+   half input_buffer[WPT];
+   half residual_buffer[WPT];
+
+   for (int i = WPT * threadIdx.x; i < cols; i += WPT * blockDim.x) {
+     // Load data using 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       input_buffer[j] = input[i + j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer[j] = residual[i + j];
+     }
+
+     // Add everything in the residual buffer and accumulate variance
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer[j] += input_buffer[j];
+       fp32_residual = (float)residual_buffer[j];
+       variance += fp32_residual * fp32_residual;
+     }
+
+     // 128-bits store
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual[i + j] = residual_buffer[j];
+     }
+   }
+   variance /= cols;
+
+   // Block reduce to compute the total norm
+   __shared__ float shared_normalizer;
+   using BlockReduce = hipcub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
+   if (threadIdx.x == 0) {
+     shared_normalizer = rsqrtf(variance + epsilon);
+   }
+   __syncthreads();
+
+   // Normalize and convert
+   float tmp_float;
+   half residual_buffer_[WPT];
+   half weight_buffer[WPT];
+   __hip_fp8_storage_t fp8_buffer[WPT];
+
+   for (int i = WPT * threadIdx.x; i < cols; i += WPT * blockDim.x) {
+     // 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer_[j] = residual[i + j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       weight_buffer[j] = weight[i + j];
+     }
+
+     // Compute and fill buffer
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       tmp_float = (float)residual_buffer_[j] * shared_normalizer;
+       tmp_float = (float)((half)(tmp_float)*weight_buffer[j]);
+       tmp_float *= scale;
+       FP8_CLAMP(tmp_float, float);
+       fp8_buffer[j] = __hip_cvt_float_to_fp8(tmp_float, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
+     }
+
+     // 64b store
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       output[i + j] = fp8_buffer[j];
+     }
+   }
+ }
+
+ #define LAUNCH_RESIDUAL_RMS_V1 \
+   (_residual_rms_v1<<<grid, block, 0, stream>>>((half*)input.data_ptr(), (half*)residual.data_ptr(), \
+                                                 (half*)weight.data_ptr(), (__hip_fp8_storage_t*)output.data_ptr(), \
+                                                 epsilon, scale, cols))
residual_rms/residual_rms_v2.cu ADDED
@@ -0,0 +1,118 @@
+ #include <torch/all.h>
+
+ #include <hip/hip_bf16.h>
+ #include <hip/hip_fp16.h>
+ #include <hipcub/util_type.hpp>
+ #include <hipcub/hipcub.hpp>
+ #include <hip/hip_fp8.h>
+
+ #include "utils/macros.h"
+
+ #define WPT 8 // WorkPerThreads
+ #define CDIV(a, b) ((a + b - 1) / (b)) // Ceiling division
+
+ __global__ void _residual_rms_v2(const half* __restrict__ input, half* __restrict__ residual,
+                                  const half* __restrict__ weight, __hip_fp8_storage_t* __restrict__ output,
+                                  const float epsilon, const float scale, const int cols) {
+   // Advance pointers according to the position of the thread in the grid
+   input += blockIdx.x * cols + WPT * threadIdx.x;
+   residual += blockIdx.x * cols + WPT * threadIdx.x;
+   weight += WPT * threadIdx.x;
+   output += blockIdx.x * cols + WPT * threadIdx.x;
+   half* residual_start = residual;
+
+   // Residual connection: inplace add of input to residual, accumulate norm along the way
+   float variance = 0.0f;
+   float fp32_residual;
+   half input_buffer[WPT];
+   half residual_buffer[WPT];
+
+   const int loop_stride = WPT * blockDim.x;
+   const int iterations = CDIV(cols - WPT * threadIdx.x, loop_stride);
+   for (int i = 0; i < iterations; i++) {
+     // Load data using 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       input_buffer[j] = input[j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer[j] = residual[j];
+     }
+
+     // Add everything in the residual buffer and accumulate variance
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer[j] += input_buffer[j];
+       fp32_residual = (float)residual_buffer[j];
+       variance += fp32_residual * fp32_residual;
+     }
+
+     // 128-bits store
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual[j] = residual_buffer[j];
+     }
+
+     // Advance pointers
+     input += loop_stride;
+     residual += loop_stride;
+   }
+   variance /= cols;
+
+   // Block reduce to compute the total norm
+   __shared__ float shared_normalizer;
+   using BlockReduce = hipcub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
+   if (threadIdx.x == 0) {
+     shared_normalizer = rsqrtf(variance + epsilon);
+   }
+   __syncthreads();
+
+   // Normalize and convert
+   float tmp_float;
+   half residual_buffer_[WPT];
+   half weight_buffer[WPT];
+   __hip_fp8_storage_t fp8_buffer[WPT];
+
+   residual = residual_start;
+   for (int i = 0; i < iterations; i++) {
+     // 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer_[j] = residual[j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       weight_buffer[j] = weight[j];
+     }
+
+     // Compute and fill buffer
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       tmp_float = (float)residual_buffer_[j] * shared_normalizer;
+       tmp_float = (float)((half)(tmp_float)*weight_buffer[j]);
+       tmp_float *= scale;
+       FP8_CLAMP(tmp_float, float);
+       fp8_buffer[j] = __hip_cvt_float_to_fp8(tmp_float, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
+     }
+
+     // 64b store
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       output[j] = fp8_buffer[j];
+     }
+
+     // Advance pointers
+     residual += loop_stride;
+     weight += loop_stride;
+     output += loop_stride;
+   }
+ }
+
+ #define LAUNCH_RESIDUAL_RMS_V2 \
+   (_residual_rms_v2<<<grid, block, 0, stream>>>((half*)input.data_ptr(), (half*)residual.data_ptr(), \
+                                                 (half*)weight.data_ptr(), (__hip_fp8_storage_t*)output.data_ptr(), \
+                                                 epsilon, scale, cols))
residual_rms/residual_rms_v3.cu ADDED
@@ -0,0 +1,125 @@
+ #include <torch/all.h>
+
+ #include <hip/hip_bf16.h>
+ #include <hip/hip_fp16.h>
+ #include <hipcub/util_type.hpp>
+ #include <hipcub/hipcub.hpp>
+ #include <hip/hip_fp8.h>
+
+ #include "utils/macros.h"
+
+ #define WPT 8 // WorkPerThreads
+ #define CDIV(a, b) ((a + b - 1) / (b)) // Ceiling division
+
+ __global__ void _residual_rms_v3(const half* __restrict__ input, half* __restrict__ residual,
+                                  const half* __restrict__ weight, __hip_fp8x2_storage_t* __restrict__ output,
+                                  const float epsilon, const float scale, const int cols) {
+   // Advance pointers according to the position of the thread in the grid
+   input += blockIdx.x * cols + WPT * threadIdx.x;
+   residual += blockIdx.x * cols + WPT * threadIdx.x;
+   weight += WPT * threadIdx.x;
+   output += (blockIdx.x * cols + WPT * threadIdx.x) / 2;
+   half* residual_start = residual;
+
+   // Residual connection: inplace add of input to residual, accumulate norm along the way
+   float variance = 0.0f;
+   float fp32_residual;
+   half input_buffer[WPT];
+   half residual_buffer[WPT];
+
+   const int loop_stride = WPT * blockDim.x;
+   const int iterations = CDIV(cols - WPT * threadIdx.x, loop_stride);
+   for (int i = 0; i < iterations; i++) {
+     // Load data using 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       input_buffer[j] = input[j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer[j] = residual[j];
+     }
+
+     // Add everything in the residual buffer and accumulate variance
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer[j] += input_buffer[j];
+       fp32_residual = (float)residual_buffer[j];
+       variance += fp32_residual * fp32_residual;
+     }
+
+     // 128-bits store
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual[j] = residual_buffer[j];
+     }
+
+     // Advance pointers
+     input += loop_stride;
+     residual += loop_stride;
+   }
+   variance /= cols;
+
+   // Block reduce to compute the total norm
+   __shared__ float shared_normalizer;
+   using BlockReduce = hipcub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
+   if (threadIdx.x == 0) {
+     shared_normalizer = rsqrtf(variance + epsilon);
+   }
+   __syncthreads();
+
+   // Normalize and convert
+   float2 tmp_float2;
+   half residual_buffer_[WPT];
+   half weight_buffer[WPT];
+   __hip_fp8x2_storage_t fp8x2_buffer[WPT / 2];
+
+   residual = residual_start;
+   for (int i = 0; i < iterations; i++) {
+     // 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       residual_buffer_[j] = residual[j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT; j++) {
+       weight_buffer[j] = weight[j];
+     }
+
+     // Compute and fill buffer
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       // .x
+       tmp_float2.x = (float)residual_buffer_[2 * j] * shared_normalizer;
+       tmp_float2.x = (float)((half)(tmp_float2.x) * weight_buffer[2 * j]);
+       tmp_float2.x *= scale;
+       FP8_CLAMP(tmp_float2.x, float);
+       // .y
+       tmp_float2.y = (float)residual_buffer_[2 * j + 1] * shared_normalizer;
+       tmp_float2.y = (float)((half)(tmp_float2.y) * weight_buffer[2 * j + 1]);
+       tmp_float2.y *= scale;
+       FP8_CLAMP(tmp_float2.y, float);
+       // convert
+       fp8x2_buffer[j] = __hip_cvt_float2_to_fp8x2(tmp_float2, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
+     }
+
+     // 64b store
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       output[j] = fp8x2_buffer[j];
+     }
+
+     // Advance pointers
+     residual += loop_stride;
+     weight += loop_stride;
+     output += loop_stride / 2;
+   }
+ }
+
+ #define LAUNCH_RESIDUAL_RMS_V3 \
+   (_residual_rms_v3<<<grid, block, 0, stream>>>((half*)input.data_ptr(), (half*)residual.data_ptr(), \
+                                                 (half*)weight.data_ptr(), (__hip_fp8x2_storage_t*)output.data_ptr(), \
+                                                 epsilon, scale, cols))
residual_rms/residual_rms_v4.cu ADDED
@@ -0,0 +1,126 @@
+ #include <torch/all.h>
+
+ #include <hip/hip_bf16.h>
+ #include <hip/hip_fp16.h>
+ #include <hipcub/util_type.hpp>
+ #include <hipcub/hipcub.hpp>
+ #include <hip/hip_fp8.h>
+
+ #include "utils/macros.h"
+
+ #define WPT 8 // WorkPerThreads
+ #define CDIV(a, b) ((a + b - 1) / (b)) // Ceiling division
+
+ __global__ void _residual_rms_v4(const __half2* __restrict__ input, __half2* __restrict__ residual,
+                                  const __half2* __restrict__ weight, __hip_fp8x2_storage_t* __restrict__ output,
+                                  const float epsilon, const float scale, const int cols) {
+   // Advance pointers according to the position of the thread in the grid
+   input += (blockIdx.x * cols + WPT * threadIdx.x) / 2;
+   residual += (blockIdx.x * cols + WPT * threadIdx.x) / 2;
+   weight += (WPT * threadIdx.x) / 2;
+   output += (blockIdx.x * cols + WPT * threadIdx.x) / 2;
+
+   // Residual connection: inplace add of input to residual, accumulate norm along the way
+   float variance = 0.0f;
+   float fp32_residual;
+   __half2 input_buffer[WPT / 2];
+   __half2 residual_buffer[WPT / 2];
+
+   const int loop_stride = blockDim.x * (WPT / 2);
+   const int iterations = CDIV(cols - WPT * threadIdx.x, 2 * loop_stride);
+   for (int i = 0; i < iterations; i++) {
+     // Load data using 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       input_buffer[j] = input[j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       residual_buffer[j] = residual[j];
+     }
+
+     // Residual connection and variance accumulation
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       asm volatile(
+           "V_PK_ADD_F16 %0, %2, %3\n\t"
+           "V_DOT2C_F32_F16 %1, %2, %2"
+           : "=v"(residual_buffer[j]), "=v"(variance)
+           : "0"(residual_buffer[j]), "v"(input_buffer[j]));
+     }
+
+     // 128-bits store
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       residual[j] = residual_buffer[j];
+     }
+
+     // Advance pointers
+     input += loop_stride;
+     residual += loop_stride;
+   }
+   variance /= cols;
+
+   // Block reduce to compute the total norm
+   __shared__ float shared_normalizer;
+   using BlockReduce = hipcub::BlockReduce<float, 1024>;
+   __shared__ typename BlockReduce::TempStorage reduceStore;
+
+   variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
+   if (threadIdx.x == 0) {
+     shared_normalizer = rsqrtf(variance + epsilon);
+   }
+   __syncthreads();
+
+   // Normalize and convert
+   float2 tmp_float2;
+   __half2 residual_buffer_[WPT / 2];
+   __half2 weight_buffer[WPT / 2];
+   __hip_fp8x2_storage_t fp8x2_buffer[WPT / 2];
+
+   residual -= iterations * loop_stride;
+   for (int i = 0; i < iterations; i++) {
+     // 128-bits loads
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       residual_buffer_[j] = residual[j];
+     }
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       weight_buffer[j] = weight[j];
+     }
+
+     // Compute and fill buffer
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       // .x
+       tmp_float2.x = (float)residual_buffer_[j].x * shared_normalizer;
+       tmp_float2.x = (float)((half)(tmp_float2.x) * weight_buffer[j].x);
+       tmp_float2.x *= scale;
+       FP8_CLAMP(tmp_float2.x, float);
+       // .y
+       tmp_float2.y = (float)residual_buffer_[j].y * shared_normalizer;
+       tmp_float2.y = (float)((half)(tmp_float2.y) * weight_buffer[j].y);
+       tmp_float2.y *= scale;
+       FP8_CLAMP(tmp_float2.y, float);
+       // convert
+       fp8x2_buffer[j] = __hip_cvt_float2_to_fp8x2(tmp_float2, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
+     }
+
+     // 64b store
+     #pragma unroll
+     for (int j = 0; j < WPT / 2; j++) {
+       output[j] = fp8x2_buffer[j];
+     }
+
+     // Advance pointers
+     residual += loop_stride;
+     weight += loop_stride;
+     output += loop_stride;
+   }
+ }
+
+ #define LAUNCH_RESIDUAL_RMS_V4 \
+   (_residual_rms_v4<<<grid, block, 0, stream>>>((__half2*)input.data_ptr(), (__half2*)residual.data_ptr(), \
+                                                 (__half2*)weight.data_ptr(), \
+                                                 (__hip_fp8x2_storage_t*)output.data_ptr(), epsilon, scale, cols))
test/__init__.py ADDED
File without changes
test/kernels/__init__.py ADDED
File without changes
test/kernels/test_residual_rms.py ADDED
@@ -0,0 +1,35 @@
+ """Tests for the `residual_rms` kernel.
+
+ Run `pytest test/kernels/test_residual_rms.py`.
+ """
+
+ import pytest
+ import torch
+
+ from residual_rms import residual_rms
+
+
+ def reference(input, residual, weight, epsilon, scale):
+     # PyTorch reference: residual add, RMS norm over the last dim, weight multiply, scale.
+     hidden = input.float() + residual.float()
+     variance = hidden.pow(2).mean(dim=-1, keepdim=True)
+     return (hidden * torch.rsqrt(variance + epsilon)).half() * weight * scale
+
+
+ @pytest.mark.parametrize("shape", [(16, 1024), (7, 4096)])
+ @pytest.mark.parametrize("mode", [0, 1, 2, 3, 4])
+ def test_residual_rms(shape, mode) -> None:
+     torch.manual_seed(0)
+     rows, cols = shape
+     input = torch.randn(rows, cols, dtype=torch.float16, device="cuda")
+     residual = torch.randn(rows, cols, dtype=torch.float16, device="cuda")
+     weight = torch.randn(cols, dtype=torch.float16, device="cuda")
+     # FP8 output buffer (the torch dtype choice is an assumption; the kernel emits E4M3 FNUZ bytes).
+     output = torch.empty(rows, cols, dtype=torch.float8_e4m3fnuz, device="cuda")
+     epsilon, scale = 1e-6, 1.0
+
+     expected = reference(input, residual, weight, epsilon, scale)
+     residual_rms(input, residual, weight, output, epsilon, scale, mode, 256)
+
+     # FP8 quantization is lossy, so compare with a loose tolerance.
+     torch.testing.assert_close(output.float(), expected.float(), atol=0.25, rtol=0.25)