#include "aclnn_ops.h" |
|
|
|
#include <aclnnop/aclnn_addcdiv.h> |
|
#include <aclnnop/aclnn_avgpool2d.h> |
|
#include <aclnnop/aclnn_batch_matmul.h> |
|
#include <aclnnop/aclnn_cast.h> |
|
#include <aclnnop/aclnn_constant_pad_nd.h> |
|
#include <aclnnop/aclnn_copy.h> |
|
#include <aclnnop/aclnn_cos.h> |
|
#include <aclnnop/aclnn_div.h> |
|
#include <aclnnop/aclnn_exp.h> |
|
#include <aclnnop/aclnn_fill_scalar.h> |
|
#include <aclnnop/aclnn_group_norm.h> |
|
#include <aclnnop/aclnn_index_fill_tensor.h> |
|
#include <aclnnop/aclnn_layer_norm.h> |
|
#include <aclnnop/aclnn_matmul.h> |
|
#include <aclnnop/aclnn_max_pool.h> |
|
#include <aclnnop/aclnn_mm.h> |
|
#include <aclnnop/aclnn_permute.h> |
|
#include <aclnnop/aclnn_pow_tensor_tensor.h> |
|
#include <aclnnop/aclnn_reduce_sum.h> |
|
#include <aclnnop/aclnn_repeat.h> |
|
#include <aclnnop/aclnn_repeat_interleave.h> |
|
#include <aclnnop/aclnn_roll.h> |
|
#include <aclnnop/aclnn_sin.h> |
|
#include <aclnnop/aclnn_softmax.h> |
|
#include <aclnnop/aclnn_tril.h> |
|
#include <aclnnop/aclnn_triu.h> |
|
#include <aclnnop/aclnn_upsample_nearest_2d.h> |
|
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h> |
|
#include <float.h> |
|
|
|
#include <cmath> |
|
#include <cstring> |
|
#include <exception> |
|
#include <vector> |
|
|
|
#include "ggml-impl.h" |
|
#include "kernels/ascendc_kernels.h" |
|
|
|
#define GGML_COMMON_DECL_C |
|
|
|
#include "../ggml-common.h" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
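/**
 * @brief Repeat acl_src along each dimension by the factors in repeat_array
 *        (length GGML_MAX_DIMS) and store the result in acl_dst (aclnnRepeat).
 */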
static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst, int64_t* repeat_array) { |
|
|
|
aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, |
|
&workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
|
|
|
|
|
|
|
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
ACL_CHECK( |
|
aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
ACL_CHECK(aclDestroyIntArray(repeats)); |
|
} |
|
|
|
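/**
 * @brief Compute GGML_OP_REPEAT: repeat factors are derived from the dst/src
 *        shapes (ggml dims reversed to match the acl layout) and passed to
 *        aclnn_repeat.
 */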
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
GGML_ASSERT(ggml_can_repeat(src, dst)); |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], |
|
dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]}; |
|
|
|
aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
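/**
 * @brief Element-wise addition acl_dst = acl_src0 + alpha * acl_src1 with
 *        alpha fixed to 1.0f (aclnnAdd).
 */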
static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, |
|
aclTensor* acl_src1, aclTensor* acl_dst) { |
|
aclScalar* alpha = nullptr; |
|
float alphaValue = 1.0f; |
|
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(alpha)); |
|
} |
|
|
|
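/**
 * @brief Compute GGML_OP_ADD. When src0 and src1 differ in shape and need
 *        broadcasting, BCAST_SHAPE/BCAST_PARAM produce reshaped views that
 *        satisfy the aclnnAdd broadcast rules.
 */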
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); |
|
|
|
aclTensor* acl_src0; |
|
aclTensor* acl_src1; |
|
aclTensor* acl_dst; |
|
|
|
|
|
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) { |
|
BCAST_SHAPE(src0, src1) |
|
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0)); |
|
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1)); |
|
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0)); |
|
} else { |
|
acl_src0 = ggml_cann_create_tensor(src0); |
|
acl_src1 = ggml_cann_create_tensor(src1); |
|
acl_dst = ggml_cann_create_tensor(dst); |
|
} |
|
|
|
aclnn_add(ctx, acl_src0, acl_src1, acl_dst); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src0)); |
|
ACL_CHECK(aclDestroyTensor(acl_src1)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
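/**
 * @brief Compute GGML_OP_LEAKY_RELU (F32 only); the negative slope is read
 *        from dst->op_params.
 */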
void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
GGML_ASSERT(src->type == GGML_TYPE_F32); |
|
GGML_ASSERT(dst->type == GGML_TYPE_F32); |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
float negative_slope; |
|
memcpy(&negative_slope, dst->op_params, sizeof(float)); |
|
aclScalar* acl_negative_slope = |
|
aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnLeakyReluGetWorkspaceSize( |
|
acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(acl_negative_slope)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
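/**
 * @brief Concatenate the tensors in tensorList along concat_dim into acl_dst
 *        (aclnnCat).
 */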
static void aclnn_concat(ggml_backend_cann_context& ctx, |
|
aclTensorList* tensorList, aclTensor* acl_dst, |
|
int64_t concat_dim) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
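/**
 * @brief Compute GGML_OP_CONCAT: src0 and src1 are concatenated along the
 *        ggml dimension stored in op_params, mapped to the acl dim (3 - dim).
 */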
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
aclTensor* acl_src0 = ggml_cann_create_tensor(src0); |
|
aclTensor* acl_src1 = ggml_cann_create_tensor(src1); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
const int32_t dim = ggml_get_op_params_i32(dst, 0); |
|
|
|
GGML_ASSERT(dim >= 0 && dim < 4); |
|
int32_t acl_dim = 3 - dim; |
|
|
|
aclTensor* tensors[] = {acl_src0, acl_src1}; |
|
aclTensorList* tensorList = aclCreateTensorList(tensors, 2); |
|
aclnn_concat(ctx, tensorList, acl_dst, acl_dim); |
|
|
|
ACL_CHECK(aclDestroyTensorList(tensorList)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
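/**
 * @brief Fill acl_dst with the sequence [start, stop) advancing by step
 *        (aclnnArange); n_elements must equal the number of generated values.
 */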
static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, |
|
float start, float stop, float step, |
|
int64_t n_elements) { |
|
int64_t steps = (int64_t)std::ceil((stop - start) / step); |
|
GGML_ASSERT(n_elements == steps); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT); |
|
aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT); |
|
aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT); |
|
|
|
ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(acl_start)); |
|
ACL_CHECK(aclDestroyScalar(acl_end)); |
|
ACL_CHECK(aclDestroyScalar(acl_step)); |
|
} |
|
|
|
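/**
 * @brief Compute GGML_OP_ARANGE (F32); start, stop and step are read from
 *        dst->op_params.
 */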
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
GGML_ASSERT(dst->type == GGML_TYPE_F32); |
|
|
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
int64_t n_elements = ggml_nelements(dst); |
|
float start; |
|
float stop; |
|
float step; |
|
memcpy(&start, (float*)dst->op_params + 0, sizeof(float)); |
|
memcpy(&stop, (float*)dst->op_params + 1, sizeof(float)); |
|
memcpy(&step, (float*)dst->op_params + 2, sizeof(float)); |
|
|
|
aclnn_arange(ctx, acl_dst, start, stop, step, n_elements); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
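/**
 * @brief Compute GGML_OP_SQR as dst = src * src by reusing the element-wise
 *        multiply path with src as both operands.
 */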
void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
dst->src[1] = dst->src[0]; |
|
ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst); |
|
} |
|
|
|
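/**
 * @brief Compute GGML_OP_CLAMP (F32): clamp src to the [min, max] range read
 *        from dst->op_params.
 */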
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
GGML_ASSERT(src->type == GGML_TYPE_F32); |
|
GGML_ASSERT(dst->type == GGML_TYPE_F32); |
|
|
|
float min; |
|
float max; |
|
memcpy(&min, dst->op_params, sizeof(float)); |
|
memcpy(&max, (float*)dst->op_params + 1, sizeof(float)); |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT); |
|
aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(acl_min)); |
|
ACL_CHECK(aclDestroyScalar(acl_max)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
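/**
 * @brief Compute GGML_OP_SCALE: multiply src by the scalar stored in
 *        dst->op_params (aclnnMuls).
 */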
void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
|
|
float v; |
|
memcpy(&v, dst->op_params, sizeof(float)); |
|
|
|
aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT); |
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize, |
|
&executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(scale)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
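/**
 * @brief Compute GGML_OP_ARGSORT along the innermost dimension: aclnnArgsort
 *        writes int64 indices into a temporary buffer which are then cast to
 *        dst's type.
 */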
void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
ggml_cann_pool_alloc temp_buffer_allocator( |
|
ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); |
|
void* buffer = temp_buffer_allocator.get(); |
|
aclTensor* tmp_tensor = |
|
ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), |
|
dst->ne, dst->nb, GGML_MAX_DIMS); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
    ACL_CHECK(aclnnArgsortGetWorkspaceSize(
        acl_src, -1, (order == GGML_SORT_ORDER_DESC), tmp_tensor,
        &workspaceSize, &executor));
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
workspaceSize = 0; |
|
ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, |
|
ggml_cann_type_mapping(dst->type), |
|
acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(tmp_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
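/**
 * @brief Compute GGML_OP_NORM as a layer normalization over the innermost
 *        dimension (ne[0]); eps is read from dst->op_params.
 */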
void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
float eps; |
|
memcpy(&eps, dst->op_params, sizeof(float)); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
std::vector<int64_t> normData = {dst->ne[0]}; |
|
aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size()); |
|
ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, |
|
eps, acl_dst, nullptr, nullptr, |
|
&workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyIntArray(norm)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
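/**
 * @brief Compute GGML_OP_GROUP_NORM via aclnnGroupNorm; the mean and rstd
 *        outputs are written to a scratch pool buffer and discarded.
 */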
void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
int n_groups = dst->op_params[0]; |
|
|
|
float eps; |
|
memcpy(&eps, dst->op_params + 1, sizeof(float)); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
int64_t N = src->ne[3]; |
|
int64_t C = src->ne[2]; |
|
int64_t HxW = src->ne[1] * src->ne[0]; |
|
|
|
size_t type_size = ggml_type_size(src->type); |
|
int64_t ne[] = {n_groups, N}; |
|
size_t nb[] = {type_size, type_size * n_groups}; |
|
    // each of the mean/rstd outputs holds N * n_groups elements of type_size bytes
    size_t n_bytes = N * n_groups * type_size;
|
|
|
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2); |
|
void* buffer = temp_buffer_allocator.get(); |
|
aclTensor* acl_mean_out = ggml_cann_create_tensor( |
|
buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); |
|
aclTensor* acl_rstd_out = ggml_cann_create_tensor( |
|
(char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); |
|
|
|
ACL_CHECK(aclnnGroupNormGetWorkspaceSize( |
|
acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst, |
|
acl_mean_out, acl_rstd_out, &workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
ACL_CHECK(aclDestroyTensor(acl_mean_out)); |
|
ACL_CHECK(aclDestroyTensor(acl_rstd_out)); |
|
} |
|
|
|
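/**
 * @brief Compute GGML_OP_ACC: add src1 into the view of dst described by
 *        nb1/nb2/nb3/offset from op_params. When not in-place, src0 is first
 *        copied into dst.
 */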
void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
|
|
size_t nb1 = ((int32_t*)dst->op_params)[0]; |
|
size_t nb2 = ((int32_t*)dst->op_params)[1]; |
|
size_t nb3 = ((int32_t*)dst->op_params)[2]; |
|
size_t offset = ((int32_t*)dst->op_params)[3]; |
|
bool inplace = (bool)((int32_t*)dst->op_params)[4]; |
|
|
|
size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3}; |
|
|
|
aclTensor* acl_dst = ggml_cann_create_tensor( |
|
dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); |
|
aclTensor* acl_src1 = ggml_cann_create_tensor(src1); |
|
|
|
aclScalar* alpha = nullptr; |
|
float alphaValue = 1.0f; |
|
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
if (!inplace) { |
|
size_t cpy_size = ggml_nbytes(dst); |
|
ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, |
|
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); |
|
aclTensor* acl_src0 = ggml_cann_create_tensor( |
|
src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); |
|
ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
ACL_CHECK( |
|
aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
ACL_CHECK(aclDestroyTensor(acl_src0)); |
|
} else { |
|
ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
} |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src1)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
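/**
 * @brief Compute GGML_OP_SUM_ROWS: reduce over the innermost ggml dimension
 *        (acl dim 3) with keepdim, so dst->ne[0] must be 1.
 */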
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
|
|
GGML_ASSERT(dst->ne[0] == 1); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
int64_t reduce_dims_host[] = {3}; |
|
aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnReduceSumGetWorkspaceSize( |
|
acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
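/**
 * @brief Compute GGML_OP_UPSCALE with nearest-neighbour 2D upsampling to
 *        dst's spatial size.
 */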
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
aclTensor* acl_src = |
|
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); |
|
aclTensor* acl_dst = |
|
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); |
|
|
|
std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]}; |
|
auto output_size_array = aclCreateIntArray(output_size.data(), 2); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize( |
|
acl_src, output_size_array, acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyIntArray(output_size_array)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
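/**
 * @brief Pad acl_src with `value` according to `paddings` (pairs of
 *        before/after counts per dimension, length 2 * GGML_MAX_DIMS) using
 *        aclnnConstantPadNd.
 */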
static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst, int64_t* paddings, |
|
float value = 0.0f) { |
|
aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2); |
|
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize( |
|
acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyIntArray(acl_pad)); |
|
ACL_CHECK(aclDestroyScalar(acl_value)); |
|
} |
|
|
|
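/**
 * @brief Compute GGML_OP_PAD: zero-pad src up to dst's shape, appending the
 *        padding after the existing data in each dimension.
 */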
void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
|
|
|
|
|
|
|
|
int64_t paddings[] = { |
|
0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1], |
|
0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]}; |
|
aclnn_pad(ctx, acl_src, acl_dst, paddings); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
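/**
 * @brief 2D average pooling for GGML_OP_POOL_2D (F32); kernel, stride and
 *        padding are read from dst->op_params.
 */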
static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
GGML_ASSERT(src->type == GGML_TYPE_F32); |
|
GGML_ASSERT(dst->type == GGML_TYPE_F32); |
|
|
|
aclTensor* acl_src = |
|
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); |
|
aclTensor* acl_dst = |
|
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); |
|
|
|
const int32_t* opts = (const int32_t*)dst->op_params; |
|
const int k0 = opts[1]; |
|
const int k1 = opts[2]; |
|
const int s0 = opts[3]; |
|
const int s1 = opts[4]; |
|
const int p0 = opts[5]; |
|
const int p1 = opts[6]; |
|
|
|
std::vector<int64_t> kernel_dims = {k1, k0}; |
|
std::vector<int64_t> stride_dims = {s1, s0}; |
|
std::vector<int64_t> padding_avg_dims = {p1, p0}; |
|
|
|
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); |
|
auto* strides = aclCreateIntArray(stride_dims.data(), 2); |
|
auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); |
|
|
|
bool ceil_mode = false; |
|
bool count_include_pad = true; |
|
int64_t divisor_override = 0; |
|
int8_t cube_math_type = 0; |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize( |
|
acl_src, kernel_size, strides, paddings_avg, ceil_mode, |
|
count_include_pad, divisor_override, cube_math_type, acl_dst, |
|
&workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
ACL_CHECK( |
|
aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
ACL_CHECK(aclDestroyIntArray(kernel_size)); |
|
ACL_CHECK(aclDestroyIntArray(strides)); |
|
ACL_CHECK(aclDestroyIntArray(paddings_avg)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
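/**
 * @brief 2D max pooling for GGML_OP_POOL_2D (F32): the input is first padded
 *        with -FLT_MAX into a temporary tensor, then aclnnMaxPool runs with
 *        zero padding.
 */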
static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
GGML_ASSERT(src->type == GGML_TYPE_F32); |
|
GGML_ASSERT(dst->type == GGML_TYPE_F32); |
|
|
|
aclTensor* acl_src = |
|
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); |
|
aclTensor* acl_dst = |
|
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); |
|
|
|
const int32_t* opts = (const int32_t*)dst->op_params; |
|
const int k0 = opts[1]; |
|
const int k1 = opts[2]; |
|
const int s0 = opts[3]; |
|
const int s1 = opts[4]; |
|
const int p0 = opts[5]; |
|
const int p1 = opts[6]; |
|
|
|
int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], |
|
src->ne[3]}; |
|
size_t temp_nb[GGML_MAX_DIMS]; |
|
|
|
temp_nb[0] = ggml_element_size(src); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; |
|
} |
|
|
|
    // the temporary buffer must hold the whole padded tensor described by temp_ne
    ggml_cann_pool_alloc temp_buffer_allocator(
        ctx.pool(), ggml_element_size(src) * temp_ne[0] * temp_ne[1] *
                        temp_ne[2] * temp_ne[3]);
|
void* buffer = temp_buffer_allocator.get(); |
|
aclTensor* tmp_tensor = ggml_cann_create_tensor( |
|
buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb, |
|
GGML_MAX_DIMS, ACL_FORMAT_NCHW); |
|
|
|
|
|
int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0}; |
|
float value = -FLT_MAX; |
|
aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value); |
|
|
|
|
|
std::vector<int64_t> kernel_dims = {k1, k0}; |
|
std::vector<int64_t> stride_dims = {s1, s0}; |
|
|
|
std::vector<int64_t> padding_max_dims = {0, 0, 0, 0}; |
|
std::vector<int64_t> dilation_size = {1, 1}; |
|
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); |
|
auto* strides = aclCreateIntArray(stride_dims.data(), 2); |
|
auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4); |
|
auto* dilations = aclCreateIntArray(dilation_size.data(), 2); |
|
|
|
bool ceil_mode = false; |
|
int64_t auto_pads = 0; |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnMaxPoolGetWorkspaceSize( |
|
tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, |
|
ceil_mode, acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
ACL_CHECK(aclDestroyTensor(tmp_tensor)); |
|
ACL_CHECK(aclDestroyIntArray(kernel_size)); |
|
ACL_CHECK(aclDestroyIntArray(strides)); |
|
ACL_CHECK(aclDestroyIntArray(paddings_max)); |
|
ACL_CHECK(aclDestroyIntArray(dilations)); |
|
} |
|
|
|
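/**
 * @brief Dispatch GGML_OP_POOL_2D to the average or max pooling
 *        implementation based on op_params[0].
 */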
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
const int32_t* opts = (const int32_t*)dst->op_params; |
|
enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]); |
|
switch (op) { |
|
case GGML_OP_POOL_AVG: |
|
ggml_cann_avg_pool2d(ctx, dst); |
|
break; |
|
case GGML_OP_POOL_MAX: |
|
ggml_cann_max_pool2d(ctx, dst); |
|
break; |
|
case GGML_OP_POOL_COUNT: |
|
GGML_ABORT("fatal error"); |
|
break; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
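/**
 * @brief Copy acl_src into acl_dst using aclnnInplaceCopy.
 */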
static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize, |
|
&executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
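/**
 * @brief Compute GGML_OP_DUP / GGML_OP_CPY. Same-shape F16/F32 copies use
 *        aclnnInplaceCopy; F16/F32 -> Q8_0/Q4_0 and contiguous row-wise
 *        dup/convert cases are handled by AscendC kernels (tensor metadata is
 *        staged in the `extra` device buffers); unsupported cases abort.
 */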
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); |
|
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); |
|
src->extra = src_extra_allocator.get(); |
|
dst->extra = dst_extra_allocator.get(); |
|
ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, |
|
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, |
|
ctx.stream())); |
|
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, |
|
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, |
|
ctx.stream())); |
|
|
|
if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && |
|
ggml_are_same_shape(src, dst)) { |
|
cann_copy(ctx, acl_src, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
return; |
|
} |
|
|
|
if (src->type == GGML_TYPE_F16) { |
|
if (dst->type == GGML_TYPE_Q8_0) { |
|
aclrtlaunch_ascendc_quantize_f16_q8_0( |
|
24, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne); |
|
return; |
|
} |
|
if (dst->type == GGML_TYPE_Q4_0) { |
|
aclrtlaunch_ascendc_quantize_f16_to_q4_0( |
|
24, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne); |
|
return; |
|
} |
|
if (dst->type == GGML_TYPE_F16) { |
|
if (ggml_are_same_shape(src, dst)) { |
|
cann_copy(ctx, acl_src, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
return; |
|
} |
|
if (ggml_is_contiguous(dst)) { |
|
const size_t src_type_size = ggml_type_size(src->type); |
|
if (src->nb[0] == src_type_size) { |
|
|
|
int64_t rows_num = ggml_nrows(src); |
|
|
|
aclrtlaunch_ascendc_dup_by_rows_fp16( |
|
rows_num, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, |
|
((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
return; |
|
} |
|
GGML_ABORT("fatal error"); |
|
} |
|
GGML_ABORT("fatal error"); |
|
} |
|
if (dst->type == GGML_TYPE_F32) { |
|
if (ggml_are_same_shape(src, dst)) { |
|
cann_copy(ctx, acl_src, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
return; |
|
} |
|
if (ggml_is_contiguous(dst)) { |
|
const size_t src_type_size = ggml_type_size(src->type); |
|
if (src->nb[0] == src_type_size) { |
|
|
|
int64_t rows_num = ggml_nrows(src); |
|
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( |
|
rows_num, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, |
|
((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
return; |
|
} |
|
GGML_ABORT("fatal error"); |
|
} |
|
GGML_ABORT("fatal error"); |
|
} |
|
|
|
GGML_ABORT("fatal error"); |
|
} else if (src->type == GGML_TYPE_F32) { |
|
|
|
|
|
if (dst->type == GGML_TYPE_Q8_0) { |
|
aclrtlaunch_ascendc_quantize_f32_q8_0( |
|
24, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne); |
|
return; |
|
} |
|
if (dst->type == GGML_TYPE_Q4_0) { |
|
aclrtlaunch_ascendc_quantize_f32_to_q4_0( |
|
24, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne); |
|
return; |
|
} |
|
if (dst->type == GGML_TYPE_F32) { |
|
if (ggml_are_same_shape(src, dst)) { |
|
cann_copy(ctx, acl_src, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
return; |
|
} |
|
if (ggml_is_contiguous(dst)) { |
|
const size_t src_type_size = ggml_type_size(src->type); |
|
if (src->nb[0] == src_type_size) { |
|
|
|
int64_t rows_num = ggml_nrows(src); |
|
aclrtlaunch_ascendc_dup_by_rows_fp32( |
|
rows_num, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, |
|
((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
return; |
|
} |
|
GGML_ABORT("fatal error"); |
|
} else { |
|
|
|
GGML_ABORT("fatal error"); |
|
} |
|
} |
|
if (dst->type == GGML_TYPE_F16) { |
|
if (ggml_are_same_shape(src, dst)) { |
|
cann_copy(ctx, acl_src, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
return; |
|
} |
|
if (ggml_is_contiguous(dst)) { |
|
const size_t src_type_size = ggml_type_size(src->type); |
|
if (src->nb[0] == src_type_size) { |
|
|
|
int64_t rows_num = ggml_nrows(src); |
|
aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( |
|
rows_num, ctx.stream(), src->data, dst->data, |
|
((ggml_tensor*)src->extra)->ne, |
|
((ggml_tensor*)src->extra)->nb, |
|
((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
return; |
|
} |
|
GGML_ABORT("fatal error"); |
|
} |
|
} |
|
|
|
GGML_ABORT("fatal error"); |
|
} else { |
|
if (ggml_are_same_shape(src, dst)) { |
|
cann_copy(ctx, acl_src, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
return; |
|
} |
|
GGML_ABORT("fatal error"); |
|
} |
|
} |
|
|
|
#ifdef __cplusplus |
|
extern "C" { |
|
#endif |
|
aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x, |
|
const aclTensor* gamma, double epsilon, |
|
const aclTensor* yOut, |
|
                                         const aclTensor* rstdOut,
|
uint64_t* workspaceSize, |
|
aclOpExecutor** executor); |
|
aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, |
|
aclOpExecutor* executor, aclrtStream stream); |
|
#ifdef __cplusplus |
|
} |
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
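/**
 * @brief Asynchronously zero-fill `buffer` (n_bytes) and wrap it in an
 *        aclTensor with the given shape and type, using contiguous strides.
 */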
static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, |
|
size_t n_bytes, int64_t* ne, int64_t dims, |
|
aclDataType type, size_t type_size) { |
|
size_t nb[GGML_MAX_DIMS]; |
|
nb[0] = type_size; |
|
for (int i = 1; i < dims; i++) { |
|
nb[i] = nb[i - 1] * ne[i - 1]; |
|
} |
|
|
|
ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream())); |
|
aclTensor* zero = |
|
ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims); |
|
return zero; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
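/**
 * @brief Create a tensor backed by `buffer` whose elements are all `value`
 *        (zero-fill followed by aclnnInplaceAdds).
 */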
static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, |
|
size_t n_bytes, int64_t* ne, int64_t dims, |
|
aclDataType type, size_t type_size, |
|
float value = 1.0f) { |
|
aclTensor* acl_tensor = |
|
aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); |
|
float alpha_host = 1.0f; |
|
aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); |
|
aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha, |
|
&workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
ACL_CHECK( |
|
aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
return acl_tensor; |
|
} |
|
|
|
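/**
 * @brief Compute GGML_OP_RMS_NORM with aclnnRmsNorm, using an all-ones gamma
 *        tensor and a scratch rstd output; eps is read from dst->op_params.
 */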
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
float eps; |
|
memcpy(&eps, dst->op_params, sizeof(float)); |
|
|
|
GGML_ASSERT(eps > 0.0f); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); |
|
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); |
|
|
|
aclTensor* acl_gamma = aclnn_values( |
|
ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, |
|
ggml_cann_type_mapping(src->type), ggml_element_size(src)); |
|
|
|
size_t zero_tensor_n_bytes = |
|
src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); |
|
ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); |
|
aclTensor* acl_rstd = |
|
aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, |
|
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), |
|
ggml_element_size(src)); |
|
|
|
ACL_CHECK(aclnnRmsNormGetWorkspaceSize( |
|
acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
ACL_CHECK(aclDestroyTensor(acl_gamma)); |
|
ACL_CHECK(aclDestroyTensor(acl_rstd)); |
|
} |
|
|
|
|
|
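/**
 * @brief Diagonal masking helper (e.g. GGML_OP_DIAG_MASK_INF): keep the lower
 *        triangle of src shifted by n_past and add `value` to the masked
 *        upper-triangle entries.
 */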
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, |
|
float value) { |
|
ggml_tensor* src = dst->src[0]; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
const int n_past = ((int32_t*)dst->op_params)[0]; |
|
|
|
size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * |
|
src->ne[3] * ggml_element_size(src); |
|
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); |
|
|
|
aclTensor* mask_tensor = |
|
aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, |
|
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), |
|
ggml_element_size(src), value); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
aclScalar* alpha = nullptr; |
|
float alphaValue = 1.0f; |
|
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); |
|
|
|
ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
ACL_CHECK( |
|
aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(alpha)); |
|
ACL_CHECK(aclDestroyTensor(mask_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
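/**
 * @brief Cast acl_src to cast_data_type and store the result in acl_dst
 *        (aclnnCast).
 */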
static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst, aclDataType cast_data_type) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
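/**
 * @brief Permute acl_src into acl_dst according to new_dim (length dims)
 *        using aclnnPermute.
 */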
static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { |
|
aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyIntArray(acl_dims)); |
|
} |
|
|
|
#ifdef __cplusplus |
|
extern "C" { |
|
#endif |
|
aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self, |
|
const aclIntArray* kernelSize, |
|
const aclIntArray* dilation, |
|
const aclIntArray* padding, |
|
const aclIntArray* stride, |
|
aclTensor* out, uint64_t* workspaceSize, |
|
aclOpExecutor** executor); |
|
aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, |
|
aclOpExecutor* executor, aclrtStream stream); |
|
#ifdef __cplusplus |
|
} |
|
#endif |
|
|
|
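/**
 * @brief Post-process the 2D im2col result: permute the (optionally cast)
 *        temporary tensor into dst's layout.
 */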
static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst, |
|
ggml_tensor* src1, |
|
aclTensor* tmp_cast_tensor, |
|
aclTensor* tmp_im2col_tensor) { |
|
|
|
int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; |
|
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; |
|
aclTensor* acl_dst = |
|
ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); |
|
|
|
int64_t permute_dim[] = {0, 2, 1}; |
|
if (src1->type != dst->type) { |
|
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); |
|
} else { |
|
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); |
|
} |
|
|
|
|
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
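/**
 * @brief Post-process the 1D im2col result: permute the temporary tensor,
 *        then copy the rows belonging to the 1D output (given s0, p0, d0)
 *        into dst.
 */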
static void ggml_cann_im2col_1d_post_process( |
|
ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1, |
|
aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor, |
|
const std::vector<int64_t>& im2col_op_params) { |
|
|
|
const int64_t KH = im2col_op_params[0]; |
|
const int64_t KW = im2col_op_params[1]; |
|
const int64_t IW = im2col_op_params[2]; |
|
const int64_t IC = im2col_op_params[3]; |
|
const int64_t N = im2col_op_params[4]; |
|
const int64_t OH = im2col_op_params[5]; |
|
const int64_t OW = im2col_op_params[6]; |
|
const int64_t s0 = im2col_op_params[7]; |
|
const int64_t p0 = im2col_op_params[8]; |
|
const int64_t d0 = im2col_op_params[9]; |
|
const int64_t n_bytes_factor = im2col_op_params[10]; |
|
|
|
|
|
|
|
aclTensor* tmp_permute_tensor = nullptr; |
|
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool()); |
|
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); |
|
void* tmp_permute_buffer = tmp_permute_allocator.get(); |
|
|
|
int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N}; |
|
size_t tmp_permute_nb[GGML_MAX_DIMS - 1]; |
|
tmp_permute_nb[0] = ggml_type_size(dst->type); |
|
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { |
|
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; |
|
} |
|
|
|
tmp_permute_tensor = ggml_cann_create_tensor( |
|
tmp_permute_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb, |
|
GGML_MAX_DIMS - 1, ACL_FORMAT_ND); |
|
|
|
int64_t permute_dim[] = {0, 2, 1}; |
|
if (src1->type != dst->type) { |
|
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3); |
|
} else { |
|
aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, |
|
3); |
|
} |
|
|
|
|
|
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1; |
|
size_t offset; |
|
void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer; |
|
|
|
|
|
if (IC > 1) { |
|
offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type); |
|
size_t size_cpy = KH * KW * ggml_type_size(dst->type); |
|
|
|
for (int c = 0; c < IC; c++) { |
|
cur_permute_buffer = (char*)tmp_permute_buffer + offset + |
|
KH * KW * c * ggml_type_size(dst->type); |
|
cur_dst_buffer = (char*)dst->data + |
|
c * KH * KW * n_step_w * ggml_type_size(dst->type); |
|
|
|
for (int i = 0; i < n_step_w; i++) { |
|
ACL_CHECK(aclrtMemcpyAsync( |
|
cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy, |
|
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); |
|
cur_dst_buffer = |
|
(char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type); |
|
cur_permute_buffer = (char*)cur_permute_buffer + |
|
KH * KW * IC * ggml_type_size(dst->type); |
|
} |
|
} |
|
} else { |
|
        offset = KH * KW * n_step_w * ggml_type_size(dst->type);
|
ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, |
|
(char*)tmp_permute_buffer + offset, offset, |
|
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); |
|
} |
|
|
|
|
|
ACL_CHECK(aclDestroyTensor(tmp_permute_tensor)); |
|
} |
|
|
|
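/**
 * @brief Compute GGML_OP_IM2COL: run aclnnIm2col on src1 into a temporary
 *        buffer, cast to dst's type if needed, then apply the 2D or 1D
 *        post-processing step.
 */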
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
|
|
GGML_TENSOR_BINARY_OP_LOCALS; |
|
|
|
|
|
|
|
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; |
|
const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; |
|
const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1; |
|
const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; |
|
const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1; |
|
const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; |
|
const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1; |
|
|
|
const int64_t N = ne13; |
|
const int64_t IC = ne12; |
|
const int64_t KH = ne01; |
|
const int64_t KW = ne00; |
|
const int64_t IW = ne10; |
|
|
|
const int64_t OH = is_2D ? ne2 : 1; |
|
const int64_t OW = ne1; |
|
|
|
|
|
const int64_t n_bytes_factor = is_2D ? 1 : 3; |
|
|
|
|
|
aclTensor* acl_src1 = ggml_cann_create_tensor(src1); |
|
int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N}; |
|
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; |
|
|
|
tmp_im2col_nb[0] = ggml_type_size(src1->type); |
|
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { |
|
tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1]; |
|
} |
|
|
|
|
|
|
|
|
|
ggml_cann_pool_alloc im2col_allocator( |
|
ctx.pool(), |
|
ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor); |
|
void* tmp_im2col_buffer = im2col_allocator.get(); |
|
|
|
aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor( |
|
tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), |
|
ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb, |
|
GGML_MAX_DIMS - 1, ACL_FORMAT_ND); |
|
|
|
std::vector<int64_t> kernel_dims = {KH, KW}; |
|
std::vector<int64_t> dilation_size = {d1, d0}; |
|
std::vector<int64_t> padding_dims = {p1, p0}; |
|
std::vector<int64_t> stride_dims = {s1, s0}; |
|
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); |
|
auto* dilations = aclCreateIntArray(dilation_size.data(), 2); |
|
auto* paddings = aclCreateIntArray(padding_dims.data(), 2); |
|
auto* strides = aclCreateIntArray(stride_dims.data(), 2); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations, |
|
paddings, strides, tmp_im2col_tensor, |
|
&workspaceSize, &executor)); |
|
|
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool()); |
|
if (workspaceSize > 0) { |
|
workspace_allocator.alloc(workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
|
|
aclTensor* tmp_cast_tensor = nullptr; |
|
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool()); |
|
void* tmp_cast_buffer = nullptr; |
|
if (src1->type != dst->type) { |
|
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); |
|
tmp_cast_buffer = tmp_cast_allocator.get(); |
|
size_t temp_cast_nb[GGML_MAX_DIMS - 1]; |
|
temp_cast_nb[0] = ggml_type_size(dst->type); |
|
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { |
|
temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1]; |
|
} |
|
|
|
tmp_cast_tensor = ggml_cann_create_tensor( |
|
tmp_cast_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb, |
|
GGML_MAX_DIMS - 1, ACL_FORMAT_ND); |
|
aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, |
|
ggml_cann_type_mapping(dst->type)); |
|
} |
|
|
|
|
|
if (is_2D) { |
|
ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor, |
|
tmp_im2col_tensor); |
|
} else { |
|
std::vector<int64_t> im2col_op_params = { |
|
KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor}; |
|
ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor, |
|
tmp_im2col_tensor, im2col_op_params); |
|
} |
|
|
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src1)); |
|
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor)); |
|
ACL_CHECK(aclDestroyIntArray(kernel_size)); |
|
ACL_CHECK(aclDestroyIntArray(dilations)); |
|
ACL_CHECK(aclDestroyIntArray(paddings)); |
|
ACL_CHECK(aclDestroyIntArray(strides)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
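/**
 * @brief In-place exponential: acl_src = exp(acl_src).
 */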
static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK( |
|
aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
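/**
 * @brief Multiply a tensor by a scalar: acl_dst = acl_src * scale, or update
 *        acl_src in place when `inplace` is true.
 */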
static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
float scale, aclTensor* acl_dst, bool inplace) { |
|
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
if (inplace) { |
|
ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
} else { |
|
ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
ACL_CHECK(aclDestroyScalar(acl_scale)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
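/**
 * @brief In-place element-wise multiplication: acl_src *= acl_other.
 */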
static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_src, aclTensor* acl_other) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
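/**
 * @brief Element-wise multiplication: acl_dst = acl_src * acl_other.
 */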
static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_other, aclTensor* acl_dst) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
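/**
 * @brief Element-wise cosine: acl_dst = cos(acl_src).
 */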
static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK( |
|
aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
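/**
 * @brief Element-wise sine: acl_dst = sin(acl_src).
 */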
static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK( |
|
aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
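/**
 * @brief In-place addcdiv: acl_self += value * (tensor1 / tensor2).
 */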
static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_self, aclTensor* tensor1, |
|
aclTensor* tensor2, float value) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); |
|
|
|
ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize( |
|
acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
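/**
 * @brief Element-wise division: acl_dst = acl_src / acl_other, or update
 *        acl_src in place when `inplace` is true.
 */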
static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_other, aclTensor* acl_dst, |
|
bool inplace) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
if (inplace) { |
|
ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
} else { |
|
ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
} |
|
|
|
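/**
 * @brief Compute GGML_OP_TIMESTEP_EMBEDDING (F32): build the frequency vector
 *        exp(-log(max_period) * arange(half) / half), multiply it with the
 *        timesteps and concatenate the cos and sin halves into dst.
 */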
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst) { |
|
const ggml_tensor* src = dst->src[0]; |
|
|
|
GGML_ASSERT(src->type == GGML_TYPE_F32); |
|
GGML_ASSERT(dst->type == GGML_TYPE_F32); |
|
|
|
const int dim = dst->op_params[0]; |
|
const int max_period = dst->op_params[1]; |
|
int half = dim / 2; |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src); |
|
|
|
|
|
float start = 0; |
|
float stop = half; |
|
float step = 1; |
|
int64_t n_elements_arange = half; |
|
int64_t tmp_arange_ne[] = {half}; |
|
    size_t tmp_arange_nb[] = {ggml_type_size(dst->type)};
|
|
|
    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                          half * ggml_type_size(dst->type));
|
void* tmp_arange_buffer = arange_allocator.get(); |
|
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor( |
|
tmp_arange_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb, |
|
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
|
|
aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange); |
|
|
|
|
|
float freq_param = -logf(max_period) / half; |
|
bool inplace = true; |
|
aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace); |
|
aclnn_exp(ctx, tmp_arange_tensor); |
|
|
|
|
|
int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]}; |
|
size_t tmp_permute_nb[GGML_MAX_DIMS]; |
|
tmp_permute_nb[0] = ggml_type_size(src->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; |
|
} |
|
|
|
ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); |
|
void* tmp_permute_buffer = permute_allocator.get(); |
|
    aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
|
tmp_permute_buffer, ggml_cann_type_mapping(src->type), |
|
ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb, |
|
GGML_MAX_DIMS, ACL_FORMAT_ND); |
|
int64_t permute_dim[] = {0, 1, 3, 2}; |
|
int64_t num_dims = 4; |
|
    aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
|
|
|
|
|
int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2], |
|
src->ne[3]}; |
|
size_t tmp_mul_nb[GGML_MAX_DIMS]; |
|
tmp_mul_nb[0] = ggml_type_size(src->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1]; |
|
} |
|
|
|
int mul_nelements = |
|
src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; |
|
|
|
ggml_cann_pool_alloc mul_allocator( |
|
ctx.pool(), mul_nelements * ggml_type_size(src->type)); |
|
void* tmp_mul_buffer = mul_allocator.get(); |
|
aclTensor* tmp_mul_tensor = ggml_cann_create_tensor( |
|
tmp_mul_buffer, ggml_cann_type_mapping(src->type), |
|
ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, |
|
ACL_FORMAT_ND); |
|
    aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
|
|
|
|
|
ggml_cann_pool_alloc cos_allocator( |
|
ctx.pool(), mul_nelements * ggml_type_size(src->type)); |
|
void* tmp_cos_buffer = cos_allocator.get(); |
|
aclTensor* tmp_cos_tensor = ggml_cann_create_tensor( |
|
tmp_cos_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, |
|
ACL_FORMAT_ND); |
|
|
|
aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor); |
|
|
|
|
|
ggml_cann_pool_alloc sin_allocator( |
|
ctx.pool(), mul_nelements * ggml_type_size(src->type)); |
|
void* tmp_sin_buffer = sin_allocator.get(); |
|
aclTensor* tmp_sin_tensor = ggml_cann_create_tensor( |
|
tmp_sin_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, |
|
ACL_FORMAT_ND); |
|
|
|
aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor); |
|
|
|
|
|
int64_t concat_dim = 3; |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor}; |
|
aclTensorList* tensorList = aclCreateTensorList(tensors, 2); |
|
aclnn_concat(ctx, tensorList, acl_dst, concat_dim); |
|
|
|
|
|
|
|
ACL_CHECK(aclDestroyTensorList(tensorList)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); |
|
    ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
|
ACL_CHECK(aclDestroyTensor(tmp_mul_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
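/**
 * @brief Fills a tensor in place with a scalar value via
 *        aclnnInplaceFillScalar.
 *
 * @param ctx     CANN backend context.
 * @param scalar  Value written to every element.
 * @param acl_dst Tensor that is filled in place.
 */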
static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, |
|
aclTensor* acl_dst) { |
|
auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize( |
|
acl_dst, acl_scalar, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor, |
|
ctx.stream())); |
|
ACL_CHECK(aclDestroyScalar(acl_scalar)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
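/**
 * @brief In-place element-wise power: acl_dst = acl_dst ^ acl_exp, using
 *        aclnnInplacePowTensorTensor.
 *
 * @param ctx     CANN backend context.
 * @param acl_dst Base tensor, overwritten with the result.
 * @param acl_exp Exponent tensor.
 */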
static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_dst, aclTensor* acl_exp) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize( |
|
acl_dst, acl_exp, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize, |
|
executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
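/**
 * @brief Applies ALiBi (Attention with Linear Biases) to a tensor.
 *
 * Computes per-head slopes from max_bias: m0^k (k = 1..n_heads_log2_floor)
 * for the first n_heads_log2_floor heads and m1^(2j+1) for the remaining
 * ones, multiplies the position/mask tensor by those slopes and adds the
 * result to acl_src, writing the sum into acl_dst.
 *
 * @param ctx          CANN backend context.
 * @param acl_src      Input tensor (typically the scaled attention scores).
 * @param acl_position Position/mask tensor that is scaled by the slopes.
 * @param acl_dst      Output tensor.
 * @param n_head       Number of attention heads (must equal src_ne[2]).
 * @param src_ne       Shape of the source tensor.
 * @param src_nb0      Byte stride of the innermost source dimension.
 * @param max_bias     Maximum ALiBi bias.
 * @param dst          Destination ggml tensor, used for type/size queries.
 */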
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_position, aclTensor* acl_dst, |
|
const int n_head, int64_t* src_ne, const size_t src_nb0, |
|
float max_bias, ggml_tensor* dst) { |
|
const int64_t ne2_ne3 = src_ne[2] * src_ne[3]; |
|
GGML_ASSERT(src_nb0 == sizeof(float)); |
|
GGML_ASSERT(n_head == src_ne[2]); |
|
|
|
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head)); |
|
|
|
float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); |
|
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); |
|
|
|
|
|
ggml_cann_pool_alloc arange_allocator(ctx.pool(), |
|
ne2_ne3 * ggml_type_size(dst->type)); |
|
void* tmp_arange_buffer = arange_allocator.get(); |
|
|
|
|
|
float start = 1; |
|
float stop = n_heads_log2_floor + 1; |
|
float step = 1; |
|
int64_t n_elements_arange = n_heads_log2_floor; |
|
|
|
int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; |
|
    size_t tmp_arange1_nb[] = {ggml_type_size(dst->type)};
|
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor( |
|
tmp_arange_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb, |
|
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
|
|
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange); |
|
|
|
aclTensor* tmp_arange2_tensor = nullptr; |
|
if (n_heads_log2_floor < ne2_ne3) { |
|
|
|
start = 1; |
|
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; |
|
step = 2; |
|
n_elements_arange = ne2_ne3 - n_heads_log2_floor; |
|
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; |
|
        size_t tmp_arange2_nb[] = {ggml_type_size(dst->type)};
|
|
|
        tmp_arange2_tensor = ggml_cann_create_tensor(
|
(char*)tmp_arange_buffer + |
|
n_heads_log2_floor * ggml_type_size(dst->type), |
|
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), |
|
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, |
|
n_elements_arange); |
|
} |
|
|
|
|
|
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), |
|
ne2_ne3 * ggml_type_size(dst->type)); |
|
void* tmp_mk_base_buffer = mk_base_allocator.get(); |
|
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; |
|
    size_t tmp_mk_base1_nb[] = {ggml_type_size(dst->type)};
|
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor( |
|
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb, |
|
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
|
|
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor); |
|
|
|
aclTensor* tmp_mk_base2_tensor = nullptr; |
|
if (n_heads_log2_floor < ne2_ne3) { |
|
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; |
|
        size_t tmp_mk_base2_nb[] = {ggml_type_size(dst->type)};
|
        tmp_mk_base2_tensor = ggml_cann_create_tensor(
|
(char*)tmp_mk_base_buffer + |
|
n_heads_log2_floor * ggml_type_size(dst->type), |
|
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), |
|
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor); |
|
} |
|
|
|
|
|
int64_t tmp_mk_base_ne[] = {ne2_ne3}; |
|
    size_t tmp_mk_base_nb[] = {ggml_type_size(dst->type)};
|
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor( |
|
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb, |
|
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor( |
|
tmp_arange_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb, |
|
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); |
|
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor); |
|
|
|
|
|
int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; |
|
size_t tmp_mk_nb[GGML_MAX_DIMS]; |
|
tmp_mk_nb[0] = ggml_type_size(dst->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; |
|
} |
|
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor( |
|
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, |
|
ACL_FORMAT_ND); |
|
|
|
|
|
int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; |
|
size_t tmp_output_nb[GGML_MAX_DIMS]; |
|
tmp_output_nb[0] = ggml_type_size(dst->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; |
|
} |
|
ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst)); |
|
void* tmp_output_buffer = output_allocator.get(); |
|
aclTensor* tmp_output_tensor = ggml_cann_create_tensor( |
|
tmp_output_buffer, ggml_cann_type_mapping(dst->type), |
|
ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, |
|
ACL_FORMAT_ND); |
|
aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor); |
|
|
|
|
|
aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst); |
|
|
|
ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_mk_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_output_tensor)); |
|
} |
|
|
|
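/**
 * @brief Computes the CPY operator; simply forwards to ggml_cann_dup().
 */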
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_cann_dup(ctx, dst); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
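/**
 * @brief In-place addition acl_dst += acl_src using aclnnInplaceAdd with an
 *        alpha of 1.0f.
 *
 * @param ctx     CANN backend context.
 * @param acl_src Tensor added to the destination.
 * @param acl_dst Tensor updated in place.
 */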
static void aclnn_inplace_add(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_src, aclTensor* acl_dst) { |
|
aclScalar* alpha = nullptr; |
|
float alphaValue = 1.0f; |
|
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyScalar(alpha)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
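/**
 * @brief Softmax along a given dimension using aclnnSoftmax.
 *
 * @param ctx     CANN backend context.
 * @param acl_src Input tensor.
 * @param dim     Dimension along which the softmax is computed.
 * @param acl_dst Output tensor.
 */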
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
int64_t dim, aclTensor* acl_dst) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst, |
|
&workspaceSize, &executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
aclrtStream stream = ctx.stream(); |
|
ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream)); |
|
} |
|
|
|
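/**
 * @brief Computes the SOFT_MAX operator on the CANN backend.
 *
 * Scales src[0] by `scale`, optionally adds the mask src[1] (cast to FP32 if
 * it is FP16 and broadcast if its row count differs), applies ALiBi when
 * max_bias > 0, and finally runs a softmax over the innermost dimension.
 */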
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
|
|
aclTensor* acl_src0 = ggml_cann_create_tensor(src0); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
float scale = 1.0f; |
|
float max_bias = 0.0f; |
|
|
|
memcpy(&scale, (float*)dst->op_params + 0, sizeof(float)); |
|
memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float)); |
|
|
|
|
|
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); |
|
|
|
size_t n_bytes = ggml_nbytes(src0); |
|
ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes); |
|
void* input_mul_scale_buffer = mul_scale_allocator.get(); |
|
aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor( |
|
input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, |
|
src0->nb, GGML_MAX_DIMS); |
|
|
|
bool inplace = false; |
|
aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace); |
|
|
|
|
|
aclTensor* acl_src1_fp32_tensor = nullptr; |
|
aclTensor* tmp_mask_tensor = nullptr; |
|
ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool()); |
|
if (src1) { |
|
const bool use_f16 = src1->type == GGML_TYPE_F16; |
|
if (use_f16) { |
|
|
|
size_t n_bytes = ggml_nelements(src1) * sizeof(float_t); |
|
size_t src1_fp32_nb[GGML_MAX_DIMS]; |
|
src1_fp32_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1]; |
|
} |
|
src1_fp32_allocator.alloc(n_bytes); |
|
void* src1_fp32_buffer = src1_fp32_allocator.get(); |
|
acl_src1_fp32_tensor = ggml_cann_create_tensor( |
|
src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne, |
|
src1_fp32_nb, GGML_MAX_DIMS); |
|
aclTensor* acl_src1 = ggml_cann_create_tensor(src1); |
|
aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src1)); |
|
} else { |
|
acl_src1_fp32_tensor = ggml_cann_create_tensor(src1); |
|
} |
|
|
|
|
|
if (src1->ne[1] != src0->ne[1]) { |
|
|
|
int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1}; |
|
size_t tmp_mask_nb[GGML_MAX_DIMS]; |
|
tmp_mask_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1]; |
|
} |
|
tmp_mask_tensor = ggml_cann_create_tensor( |
|
src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb, |
|
GGML_MAX_DIMS, ACL_FORMAT_ND); |
|
} |
|
|
|
|
|
const int n_head = src0->ne[2]; |
|
const size_t src_nb0 = src0->nb[0]; |
|
|
|
n_bytes = ggml_nbytes(dst); |
|
ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes); |
|
void* output_buffer = output_allocator.get(); |
|
aclTensor* alibi_output_tensor = ggml_cann_create_tensor( |
|
output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne, |
|
dst->nb, GGML_MAX_DIMS); |
|
if (max_bias <= 0.0f) { |
|
|
|
if (tmp_mask_tensor) { |
|
aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor, |
|
alibi_output_tensor); |
|
} else { |
|
aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, |
|
alibi_output_tensor); |
|
} |
|
} else { |
|
|
|
if (tmp_mask_tensor) { |
|
aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor, |
|
alibi_output_tensor, n_head, src0->ne, src_nb0, |
|
max_bias, dst); |
|
} else { |
|
aclnn_alibi(ctx, acl_input_mul_scale_tensor, |
|
acl_src1_fp32_tensor, alibi_output_tensor, n_head, |
|
src0->ne, src_nb0, max_bias, dst); |
|
} |
|
} |
|
|
|
|
|
aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst); |
|
ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); |
|
} else { |
|
aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst); |
|
} |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src0)); |
|
ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
ACL_CHECK(aclDestroyScalar(acl_scale)); |
|
ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor)); |
|
ACL_CHECK(aclDestroyTensor(tmp_mask_tensor)); |
|
} |
|
|
|
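/**
 * @brief Computes the GET_ROWS operator with AscendC custom kernels.
 *
 * Gathers the rows of src[0] selected by the indices in src[1] into dst.
 * Tensor metadata (ne/nb) is copied to device memory via the `extra` fields
 * so the kernels can access it. Supported source types: F32, F16, Q4_0, Q8_0.
 */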
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
|
|
ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); |
|
ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); |
|
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); |
|
src0->extra = src0_extra_allocator.get(); |
|
src1->extra = src1_extra_allocator.get(); |
|
dst->extra = dst_extra_allocator.get(); |
|
ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, |
|
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, |
|
ctx.stream())); |
|
ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, |
|
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, |
|
ctx.stream())); |
|
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, |
|
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, |
|
ctx.stream())); |
|
|
|
switch (src0->type) { |
|
case GGML_TYPE_F32: { |
|
#ifdef ASCEND_310P |
|
|
|
|
|
if ((src0->ne[0] % 8) != 0) { |
|
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * |
|
src0->ne[0] * ggml_type_size(GGML_TYPE_F32); |
|
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); |
|
} |
|
#endif |
|
aclrtlaunch_ascendc_get_row_f32( |
|
24, ctx.stream(), src0->data, src1->data, dst->data, |
|
((ggml_tensor*)src0->extra)->ne, |
|
((ggml_tensor*)src0->extra)->nb, |
|
((ggml_tensor*)src1->extra)->ne, |
|
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
break; |
|
} |
|
case GGML_TYPE_F16: { |
|
#ifdef ASCEND_310P |
|
|
|
|
|
if ((src0->ne[0] % 16) != 0) { |
|
                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
|
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); |
|
} |
|
#endif |
|
aclrtlaunch_ascendc_get_row_f16( |
|
24, ctx.stream(), src0->data, src1->data, dst->data, |
|
((ggml_tensor*)src0->extra)->ne, |
|
((ggml_tensor*)src0->extra)->nb, |
|
((ggml_tensor*)src1->extra)->ne, |
|
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
break; |
|
} |
|
case GGML_TYPE_Q4_0: |
|
aclrtlaunch_ascendc_get_row_q4_0( |
|
24, ctx.stream(), src0->data, src1->data, dst->data, |
|
((ggml_tensor*)src0->extra)->ne, |
|
((ggml_tensor*)src1->extra)->ne, |
|
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
break; |
|
case GGML_TYPE_Q8_0: |
|
aclrtlaunch_ascendc_get_row_q8_0( |
|
24, ctx.stream(), src0->data, src1->data, dst->data, |
|
((ggml_tensor*)src0->extra)->ne, |
|
((ggml_tensor*)src1->extra)->ne, |
|
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, |
|
((ggml_tensor*)dst->extra)->nb); |
|
break; |
|
default: |
|
GGML_ABORT("fatal error"); |
|
break; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
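/**
 * @brief Repeat-interleaves a tensor along one dimension using
 *        aclnnRepeatInterleaveIntWithDim.
 *
 * @param ctx         CANN backend context.
 * @param acl_src     Input tensor.
 * @param acl_dst     Output tensor.
 * @param dim         Dimension along which elements are repeated.
 * @param repeats     Number of repetitions per element.
 * @param output_size Expected size of the output along @p dim.
 */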
static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_src, aclTensor* acl_dst, |
|
int64_t dim, int64_t repeats, |
|
int64_t output_size) { |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize( |
|
acl_src, repeats, dim, output_size, acl_dst, &workspaceSize, |
|
&executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, |
|
executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
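/**
 * @brief General matrix multiplication acl_dst = acl_input * acl_weight via
 *        aclnnMatmul.
 *
 * @param ctx        CANN backend context.
 * @param acl_input  First operand.
 * @param acl_weight Second operand.
 * @param acl_dst    Output tensor.
 */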
static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, |
|
aclTensor* acl_weight, aclTensor* acl_dst) { |
|
int8_t cube_math_type = 1; |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst, |
|
cube_math_type, &workspaceSize, |
|
&executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
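/**
 * @brief 2-D matrix multiplication acl_dst = acl_input * acl_weight via
 *        aclnnMm.
 */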
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_input, aclTensor* acl_weight, |
|
aclTensor* acl_dst) { |
|
int8_t cube_math_type = 2; |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst, |
|
cube_math_type, &workspaceSize, |
|
&executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
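/**
 * @brief Batched (3-D) matrix multiplication acl_dst = acl_input * acl_weight
 *        via aclnnBatchMatMul.
 */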
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_input, aclTensor* acl_weight, |
|
aclTensor* acl_dst) { |
|
int8_t cube_math_type = 2; |
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst, |
|
cube_math_type, &workspaceSize, |
|
&executor)); |
|
|
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK( |
|
aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
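/**
 * @brief MUL_MAT for F32/F16 weights.
 *
 * Broadcasts the operand shapes, views the weight with its two innermost
 * dimensions swapped (a transposed view), and dispatches to the 2-D, 3-D or
 * generic matmul helper depending on the broadcast rank.
 */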
static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst) { |
|
ggml_tensor* weight = dst->src[0]; |
|
ggml_tensor* input = dst->src[1]; |
|
|
|
|
|
|
|
BCAST_MUL_MAT_SHAPE(input, weight, dst); |
|
|
|
int64_t n_dims = bcast_dims; |
|
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) { |
|
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) { |
|
n_dims = 2; |
|
} else if (bcast_input_ne[2] == 1) { |
|
n_dims = 3; |
|
} |
|
} |
|
|
|
aclTensor* acl_input_tensor = |
|
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); |
|
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], |
|
bcast_weight_ne[2], bcast_weight_ne[3], |
|
bcast_weight_ne[4], bcast_weight_ne[5]}; |
|
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], |
|
bcast_weight_nb[2], bcast_weight_nb[3], |
|
bcast_weight_nb[4], bcast_weight_nb[5]}; |
|
aclTensor* acl_weight_tensor = |
|
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); |
|
aclTensor* acl_dst = |
|
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); |
|
|
|
switch (n_dims) { |
|
case 2: |
|
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); |
|
break; |
|
case 3: |
|
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); |
|
break; |
|
default: |
|
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); |
|
break; |
|
} |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
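/**
 * @brief MUL_MAT for Q4_0/Q8_0 quantized weights.
 *
 * The input is cast to FP16 if necessary, then for every batch the packed
 * quantized weights and their FP16 scales are fed to
 * aclnnWeightQuantBatchMatmulV2, splitting src0->ne[1] into chunks of at
 * most 65535 rows. The FP16 result is cast back to dst->type when it differs
 * from F16.
 */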
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, |
|
ggml_tensor* dst, |
|
const enum ggml_type type) { |
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
|
|
|
|
|
|
|
|
|
|
float weight_elem_size; |
|
if (type == GGML_TYPE_Q4_0) { |
|
weight_elem_size = float(sizeof(uint8_t)) / 2; |
|
} else if (type == GGML_TYPE_Q8_0) { |
|
weight_elem_size = float(sizeof(uint8_t)); |
|
} else { |
|
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); |
|
} |
|
float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; |
|
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; |
|
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; |
|
|
|
|
|
size_t scale_elem_size = sizeof(uint16_t); |
|
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, |
|
scale_elem_size}; |
|
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; |
|
char* scale_offset = (char*)src0->data + weight_size; |
|
|
|
|
|
size_t input_elem_size = sizeof(uint16_t); |
|
int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; |
|
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; |
|
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; |
|
    ggml_cann_pool_alloc input_allocator(ctx.pool());
|
void* input_buffer = src1->data; |
|
|
|
|
|
if (src1->type != GGML_TYPE_F16) { |
|
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); |
|
input_buffer = |
|
            input_allocator.alloc(ggml_nelements(src1) * input_elem_size);
|
|
|
int64_t* input_cast_ne = src1->ne; |
|
size_t input_cast_nb[GGML_MAX_DIMS]; |
|
input_cast_nb[0] = sizeof(uint16_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; |
|
} |
|
|
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor( |
|
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, |
|
input_cast_nb, GGML_MAX_DIMS); |
|
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); |
|
} |
|
|
|
|
|
size_t output_elem_size = sizeof(uint16_t); |
|
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; |
|
ggml_cann_pool_alloc output_allocator(ctx.pool()); |
|
void* output_buffer = |
|
output_allocator.alloc(ggml_nelements(dst) * output_elem_size); |
|
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; |
|
|
|
|
|
int64_t max_elem_size = 65535; |
|
int64_t split_size = (src0->ne[1] / max_elem_size) + 1; |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool()); |
|
aclOpExecutor* executor = nullptr; |
|
uint64_t workspaceSize = 0; |
|
void* workspaceAddr = nullptr; |
|
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { |
|
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { |
|
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); |
|
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); |
|
|
|
int64_t batch1 = (n1 * src1->ne[2]) + c1; |
|
int64_t batch0 = (n0 * src0->ne[2]) + c0; |
|
|
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor( |
|
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, |
|
input_elem_size, input_ne, input_nb, 2); |
|
|
|
|
|
int64_t weight_ne_offset = 0; |
|
int64_t weight_ne[2] = { |
|
max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, |
|
src0->ne[0]}; |
|
int64_t scale_ne_offset = 0; |
|
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; |
|
int64_t output_ne_offset = 0; |
|
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]}; |
|
|
|
aclTensor* acl_weight_tensor = ggml_cann_create_tensor( |
|
(char*)src0->data + batch0 * weight_stride, |
|
ggml_cann_type_mapping(type), weight_elem_size, weight_ne, |
|
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); |
|
aclTensor* acl_scale_tensor = ggml_cann_create_tensor( |
|
scale_offset + batch0 * scale_stride, ACL_FLOAT16, |
|
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, |
|
scale_ne_offset); |
|
aclTensor* acl_output_tensor = ggml_cann_create_tensor( |
|
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, |
|
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, |
|
output_ne_offset); |
|
|
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( |
|
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, |
|
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, |
|
&workspaceSize, &executor)); |
|
if (workspaceAddr == nullptr) { |
|
workspaceAddr = workspace_allocator.alloc(workspaceSize); |
|
} |
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2( |
|
workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_output_tensor)); |
|
|
|
|
|
for (int64_t split = 1; split < split_size; split++) { |
|
weight_ne_offset += |
|
weight_elem_size * weight_ne[0] * weight_ne[1]; |
|
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] |
|
? src0->ne[1] - (max_elem_size * split) |
|
: max_elem_size; |
|
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; |
|
scale_ne[0] = weight_ne[0]; |
|
output_ne_offset += |
|
output_elem_size * output_ne[0] * output_ne[1]; |
|
output_ne[0] = weight_ne[0]; |
|
|
|
acl_weight_tensor = ggml_cann_create_tensor( |
|
(char*)src0->data + batch0 * weight_stride, |
|
ggml_cann_type_mapping(type), weight_elem_size, weight_ne, |
|
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); |
|
acl_scale_tensor = ggml_cann_create_tensor( |
|
scale_offset + batch0 * scale_stride, ACL_FLOAT16, |
|
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, |
|
scale_ne_offset); |
|
acl_output_tensor = ggml_cann_create_tensor( |
|
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, |
|
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, |
|
output_ne_offset); |
|
|
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( |
|
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, |
|
nullptr, nullptr, nullptr, nullptr, QK8_0, |
|
acl_output_tensor, &workspaceSize, &executor)); |
|
ACL_CHECK(aclnnWeightQuantBatchMatmulV2( |
|
workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_output_tensor)); |
|
} |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); |
|
} |
|
} |
|
|
|
|
|
if (dst->type != GGML_TYPE_F16) { |
|
int64_t* output_cast_ne = dst->ne; |
|
size_t output_cast_nb[GGML_MAX_DIMS]; |
|
output_cast_nb[0] = sizeof(uint16_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; |
|
} |
|
|
|
aclTensor* acl_output_tensor = ggml_cann_create_tensor( |
|
output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne, |
|
output_cast_nb, GGML_MAX_DIMS); |
|
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); |
|
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, |
|
ggml_cann_type_mapping(dst->type)); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_output_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); |
|
} |
|
} |
|
|
|
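/**
 * @brief Computes the MUL_MAT operator, dispatching on the weight type:
 *        F32/F16 use the floating-point path, Q4_0/Q8_0 the quantized path.
 */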
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
const enum ggml_type type = dst->src[0]->type; |
|
switch (type) { |
|
case GGML_TYPE_F32: |
|
case GGML_TYPE_F16: |
|
ggml_cann_mat_mul_fp(ctx, dst); |
|
break; |
|
case GGML_TYPE_Q4_0: |
|
case GGML_TYPE_Q8_0: |
|
ggml_cann_mul_mat_quant(ctx, dst, type); |
|
break; |
|
default: |
|
GGML_ABORT("fatal error"); |
|
break; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
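/**
 * @brief Rolls the elements of a tensor along the given dimensions using
 *        aclnnRoll.
 *
 * @param ctx     CANN backend context.
 * @param acl_src Input tensor.
 * @param acl_dst Output tensor.
 * @param shifts  Shift amounts (a single value is used here).
 * @param dims    Dimensions to roll (a single value is used here).
 */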
static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, |
|
aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { |
|
aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); |
|
aclIntArray* acl_dims = aclCreateIntArray(dims, 1); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst, |
|
&workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyIntArray(acl_shifts)); |
|
ACL_CHECK(aclDestroyIntArray(acl_dims)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
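/**
 * @brief In-place index fill: sets the elements of acl_src selected by
 *        `index` along dimension `dim` to `value`, using
 *        aclnnInplaceIndexFillTensor.
 */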
static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, |
|
aclTensor* acl_src, int64_t dim, |
|
int64_t* index, int64_t index_num, |
|
float value) { |
|
aclIntArray* acl_index = aclCreateIntArray(index, index_num); |
|
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
void* workspaceAddr = nullptr; |
|
|
|
ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize( |
|
acl_src, dim, acl_index, acl_value, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize, |
|
executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyIntArray(acl_index)); |
|
ACL_CHECK(aclDestroyScalar(acl_value)); |
|
} |
|
|
|
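/**
 * @brief Initializes the sin/cos cache used by the ROPE operator.
 *
 * Computes theta = position * theta_scale^k for every position/frequency
 * pair (optionally divided by the freq_factors tensor src[2] and scaled by
 * freq_scale), takes sin/cos, scales them by attn_factor, and expands them
 * into the provided repeat tensors: repeated blockwise for NeoX-style RoPE,
 * interleaved otherwise.
 */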
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, |
|
aclTensor* acl_cos_repeat_tensor, |
|
aclTensor* acl_sin_repeat_tensor, |
|
float theta_scale, float freq_scale, |
|
float attn_factor, bool is_neox) { |
|
|
|
|
|
|
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src1 = dst->src[1]; |
|
ggml_tensor* src2 = dst->src[2]; |
|
|
|
|
|
int64_t arange_length = src0->ne[0] / 2; |
|
ggml_cann_pool_alloc arange_allocator(ctx.pool(), |
|
arange_length * sizeof(float_t)); |
|
void* arange_buffer = arange_allocator.get(); |
|
int64_t arange_ne[] = {arange_length, 1, 1, 1}; |
|
size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), |
|
arange_length * sizeof(float_t)}; |
|
|
|
aclTensor* acl_arange_tensor = |
|
ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t), |
|
arange_ne, arange_nb, GGML_MAX_DIMS); |
|
float start = 0; |
|
float step = 1; |
|
float stop = src0->ne[0] / 2; |
|
    int64_t n_elements = src0->ne[0] / 2;
|
aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), |
|
arange_length * sizeof(float_t)); |
|
void* theta_scale_buffer = theta_scale_allocator.get(); |
|
aclTensor* acl_theta_scale_tensor = aclnn_values( |
|
ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne, |
|
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale); |
|
aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor); |
|
|
|
|
|
if (freq_scale != 1) { |
|
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); |
|
} |
|
|
|
|
|
if (src2) { |
|
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( |
|
src2->data, ggml_cann_type_mapping(src2->type), |
|
ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS); |
|
aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, |
|
nullptr, true); |
|
ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor)); |
|
} |
|
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_I32); |
|
int64_t position_length = src1->ne[0]; |
|
int64_t position_ne[] = {1, position_length, 1, 1}; |
|
size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), |
|
sizeof(int32_t) * position_length, |
|
sizeof(int32_t) * position_length}; |
|
aclTensor* acl_position_tensor = ggml_cann_create_tensor( |
|
src1->data, ggml_cann_type_mapping(src1->type), |
|
ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); |
|
|
|
|
|
int64_t theta_length = arange_length * position_length; |
|
ggml_cann_pool_alloc theta_allocator(ctx.pool(), |
|
theta_length * sizeof(float_t)); |
|
void* theta_buffer = theta_allocator.get(); |
|
int64_t theta_ne[] = {arange_length, position_length, 1, 1}; |
|
size_t theta_nb[GGML_MAX_DIMS]; |
|
theta_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; |
|
} |
|
aclTensor* acl_theta_tensor = |
|
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), |
|
theta_ne, theta_nb, GGML_MAX_DIMS); |
|
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, |
|
acl_theta_tensor); |
|
|
|
|
|
int64_t permute_ne[] = {arange_length, 1, position_length, 1}; |
|
size_t permute_nb[GGML_MAX_DIMS]; |
|
permute_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1]; |
|
} |
|
ggml_cann_pool_alloc permute_allocator(ctx.pool(), |
|
theta_length * sizeof(float_t)); |
|
void* permute_buffer = permute_allocator.get(); |
|
aclTensor* acl_permute_tensor = ggml_cann_create_tensor( |
|
permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb, |
|
GGML_MAX_DIMS, ACL_FORMAT_ND); |
|
int64_t permute_dim[] = {0, 2, 1, 3}; |
|
int64_t num_dims = 4; |
|
aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim, |
|
num_dims); |
|
|
|
|
|
ggml_cann_pool_alloc sin_allocator(ctx.pool(), |
|
theta_length * sizeof(float_t)); |
|
void* sin_buffer = sin_allocator.get(); |
|
aclTensor* acl_sin_tensor = ggml_cann_create_tensor( |
|
sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb, |
|
GGML_MAX_DIMS, ACL_FORMAT_ND); |
|
aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor); |
|
|
|
ggml_cann_pool_alloc cos_allocator(ctx.pool(), |
|
theta_length * sizeof(float_t)); |
|
void* cos_buffer = cos_allocator.get(); |
|
aclTensor* acl_cos_tensor = ggml_cann_create_tensor( |
|
cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb, |
|
GGML_MAX_DIMS, ACL_FORMAT_ND); |
|
aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); |
|
|
|
|
|
if (attn_factor != 1) { |
|
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); |
|
aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true); |
|
} |
|
|
|
|
|
if (is_neox) { |
|
int64_t repeatsArray[] = {1, 1, 1, 2}; |
|
aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray); |
|
aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray); |
|
} else { |
|
int64_t num_repeats = 2; |
|
int64_t dim = 3; |
|
int64_t output_size = arange_length * num_repeats; |
|
aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, |
|
num_repeats, output_size); |
|
aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, |
|
num_repeats, output_size); |
|
} |
|
|
|
|
|
ACL_CHECK(aclDestroyTensor(acl_arange_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_position_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_theta_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_permute_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_sin_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_cos_tensor)); |
|
} |
|
|
|
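// Declarations of the aclnnRotaryPositionEmbedding entry points used by
// ggml_cann_rope() below (no corresponding aclnnop header is included above).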
#ifdef __cplusplus |
|
extern "C" { |
|
#endif |
|
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize( |
|
const aclTensor* x, const aclTensor* cos, const aclTensor* sin, |
|
int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize, |
|
aclOpExecutor** executor); |
|
aclnnStatus aclnnRotaryPositionEmbedding(void* workspace, |
|
uint64_t workspaceSize, |
|
aclOpExecutor* executor, |
|
aclrtStream stream); |
|
#ifdef __cplusplus |
|
} |
|
#endif |
|
|
|
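/**
 * @brief Computes the ROPE (rotary position embedding) operator.
 *
 * Builds the sin/cos cache with aclnn_cache_init(), then applies the
 * rotation. On ASCEND_310P the rotation is composed manually from roll,
 * multiply and add operations; otherwise aclnnRotaryPositionEmbedding is
 * used. Requires ext_factor == 0 and n_dims == ne0.
 */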
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { |
|
|
|
|
|
ggml_tensor* src0 = dst->src[0]; |
|
ggml_tensor* src2 = dst->src[2]; |
|
|
|
|
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; |
|
|
|
const int n_dims = ((int32_t*)dst->op_params)[1]; |
|
const int mode = ((int32_t*)dst->op_params)[2]; |
|
|
|
const int n_ctx_orig = ((int32_t*)dst->op_params)[4]; |
|
|
|
GGML_TENSOR_UNARY_OP_LOCALS |
|
|
|
memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float)); |
|
memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float)); |
|
memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float)); |
|
memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float)); |
|
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); |
|
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); |
|
|
|
|
|
GGML_ASSERT(n_dims == ne0); |
|
GGML_ASSERT(n_dims % 2 == 0); |
|
|
|
GGML_ASSERT(ext_factor == 0); |
|
|
|
const float theta_scale = powf(freq_base, -2.0f / n_dims); |
|
|
|
float corr_dims[2]; |
|
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, |
|
beta_slow, corr_dims); |
|
|
|
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; |
|
|
|
|
|
ggml_cann_pool_alloc sin_allocator( |
|
ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); |
|
ggml_cann_pool_alloc cos_allocator( |
|
ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); |
|
void* sin_buffer = sin_allocator.get(); |
|
void* cos_buffer = cos_allocator.get(); |
|
|
|
int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; |
|
size_t sin_reshape_nb[GGML_MAX_DIMS]; |
|
sin_reshape_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; |
|
} |
|
aclTensor* acl_sin_reshape_tensor = |
|
ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t), |
|
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); |
|
aclTensor* acl_cos_reshape_tensor = |
|
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), |
|
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); |
|
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, |
|
theta_scale, freq_scale, attn_factor, is_neox); |
|
|
|
aclTensor* acl_src = ggml_cann_create_tensor(src0); |
|
aclTensor* acl_dst = ggml_cann_create_tensor(dst); |
|
|
|
#ifdef ASCEND_310P |
|
|
|
|
|
|
|
void* input_roll_buffer; |
|
aclTensor* acl_minus_one_tensor; |
|
void* minus_one_scale_buffer = nullptr; |
|
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); |
|
ggml_cann_pool_alloc minus_one_scale_allocator( |
|
ctx.pool(), sizeof(float_t) * src0->ne[0]); |
|
if (!is_neox) { |
|
|
|
input_roll_buffer = roll_allocator.get(); |
|
int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), |
|
src0->ne[2], src0->ne[3]}; |
|
size_t input_roll_nb[GGML_MAX_DIMS]; |
|
input_roll_nb[0] = ggml_type_size(src0->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; |
|
} |
|
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( |
|
input_roll_buffer, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), input_roll_ne, input_roll_nb, |
|
GGML_MAX_DIMS); |
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor( |
|
src0->data, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), input_roll_ne, input_roll_nb, |
|
GGML_MAX_DIMS); |
|
|
|
int64_t shifts[] = {1}; |
|
int64_t dims[] = {3}; |
|
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); |
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); |
|
|
|
|
|
minus_one_scale_buffer = minus_one_scale_allocator.get(); |
|
|
|
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; |
|
size_t minus_one_nb[GGML_MAX_DIMS]; |
|
minus_one_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; |
|
} |
|
acl_minus_one_tensor = aclnn_values( |
|
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], |
|
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); |
|
int64_t dim = 3; |
|
int64_t* index = new int64_t[src0->ne[0]]; |
|
for (int i = 0; i < src0->ne[0]; i++) { |
|
index[i] = i / 2 * 2; |
|
} |
|
int64_t index_num = src0->ne[0]; |
|
float value = -1; |
|
        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
                                index_num, value);
        delete[] index;
|
} else { |
|
|
|
|
|
input_roll_buffer = roll_allocator.get(); |
|
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( |
|
input_roll_buffer, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); |
|
aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0); |
|
|
|
int64_t shifts[] = {src0->ne[0] / 2}; |
|
int64_t dims[] = {3}; |
|
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_input_tensor)); |
|
|
|
minus_one_scale_buffer = minus_one_scale_allocator.get(); |
|
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; |
|
size_t minus_one_nb[GGML_MAX_DIMS]; |
|
minus_one_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; |
|
} |
|
acl_minus_one_tensor = aclnn_values( |
|
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], |
|
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); |
|
|
|
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; |
|
size_t first_half_nb[GGML_MAX_DIMS]; |
|
first_half_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; |
|
} |
|
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( |
|
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, |
|
first_half_nb, GGML_MAX_DIMS); |
|
bool inplace = true; |
|
float scale = -1; |
|
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); |
|
ACL_CHECK(aclDestroyTensor(acl_first_half_tensor)); |
|
} |
|
|
|
|
|
GGML_ASSERT(n_dims == src0->ne[0]); |
|
|
|
|
|
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), |
|
ggml_nbytes(src0)); |
|
void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); |
|
size_t input_nb[GGML_MAX_DIMS]; |
|
input_nb[0] = ggml_type_size(src0->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; |
|
} |
|
aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor( |
|
input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); |
|
aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor( |
|
input_roll_buffer, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); |
|
|
|
aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, |
|
acl_input_roll_mul_scale_tensor); |
|
|
|
|
|
void* output_fp32_buffer; |
|
if (src0->type == GGML_TYPE_F32) { |
|
aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor); |
|
aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, |
|
acl_sin_reshape_tensor); |
|
aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst); |
|
|
|
} else if (src0->type == GGML_TYPE_F16) { |
|
size_t input_fp32_nb[GGML_MAX_DIMS]; |
|
input_fp32_nb[0] = sizeof(float_t); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; |
|
} |
|
ggml_cann_pool_alloc fp32_allocator1( |
|
ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); |
|
void* input_fp32_buffer1 = fp32_allocator1.get(); |
|
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( |
|
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne, |
|
input_fp32_nb, GGML_MAX_DIMS); |
|
ggml_cann_pool_alloc fp32_allocator2( |
|
ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); |
|
void* input_fp32_buffer2 = fp32_allocator2.get(); |
|
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( |
|
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne, |
|
input_fp32_nb, GGML_MAX_DIMS); |
|
|
|
ggml_cann_pool_alloc fp32_allocator( |
|
ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); |
|
output_fp32_buffer = fp32_allocator.get(); |
|
aclTensor* output_fp32_tensor = ggml_cann_create_tensor( |
|
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, |
|
input_fp32_nb, GGML_MAX_DIMS); |
|
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1); |
|
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, |
|
input_fp32_tensor2); |
|
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, |
|
output_fp32_tensor); |
|
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); |
|
|
|
ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); |
|
ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); |
|
ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
} |
|
return; |
|
#endif |
|
|
|
|
|
|
|
if (src0->type == GGML_TYPE_F16) { |
|
ggml_cann_pool_alloc sin_final_allocator( |
|
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); |
|
ggml_cann_pool_alloc cos_final_allocator( |
|
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); |
|
void* sin_final_buffer = sin_final_allocator.get(); |
|
void* cos_final_buffer = cos_final_allocator.get(); |
|
|
|
int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; |
|
size_t sin_final_nb[GGML_MAX_DIMS]; |
|
sin_final_nb[0] = ggml_type_size(src0->type); |
|
for (int i = 1; i < GGML_MAX_DIMS; i++) { |
|
sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1]; |
|
} |
|
aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor( |
|
sin_final_buffer, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), sin_final_ne, sin_final_nb, |
|
GGML_MAX_DIMS); |
|
aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor( |
|
cos_final_buffer, ggml_cann_type_mapping(src0->type), |
|
ggml_type_size(src0->type), sin_final_ne, sin_final_nb, |
|
GGML_MAX_DIMS); |
|
|
|
aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor, |
|
ggml_cann_type_mapping(src0->type)); |
|
aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor, |
|
ggml_cann_type_mapping(src0->type)); |
|
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); |
|
acl_sin_reshape_tensor = acl_sin_final_tensor; |
|
acl_cos_reshape_tensor = acl_cos_final_tensor; |
|
} |
|
|
|
uint64_t workspaceSize = 0; |
|
aclOpExecutor* executor; |
|
|
|
void* workspaceAddr = nullptr; |
|
|
|
int acl_mode = mode; |
|
if (mode == 0) { |
|
acl_mode = 1; |
|
} |
|
|
|
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( |
|
acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, |
|
acl_dst, &workspaceSize, &executor)); |
|
if (workspaceSize > 0) { |
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); |
|
workspaceAddr = workspace_allocator.get(); |
|
} |
|
|
|
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize, |
|
executor, ctx.stream())); |
|
|
|
ACL_CHECK(aclDestroyTensor(acl_src)); |
|
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); |
|
ACL_CHECK(aclDestroyTensor(acl_dst)); |
|
} |
|
|