#define GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"

#include <assert.h>
#include <string.h> // for memcpy

#define UNUSED GGML_UNUSED
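
// Pack four q4_0 blocks (one per row) into a single block_q4_0x4: the per-block
// scales are copied, and the quant bytes are interleaved in chunks of
// blck_size_interleave bytes (row 0 chunk, row 1 chunk, ...). XORing each byte
// with 0x88 flips the high bit of both 4-bit nibbles, converting the quants
// from their offset-by-8 unsigned storage to two's-complement signed values,
// as expected by the interleaved aarch64 kernels.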
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    // copy the delta (scale) of each source block
    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

    // number of chunks: 4 blocks * QK4_0/2 quant bytes each, divided by the chunk size
    const int end = QK4_0 * 2 / blck_size_interleave;

    if (blck_size_interleave == 8) {
        const uint64_t xor_mask = 0x8888888888888888ULL;
        for (int i = 0; i < end; ++i) {
            int src_id     = i % 4;                          // source block (row)
            int src_offset = (i / 4) * blck_size_interleave; // byte offset within that block
            int dst_offset = i * blck_size_interleave;       // byte offset in the packed output

            uint64_t elems;
            // memcpy avoids unaligned accesses and strict-aliasing violations
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
            elems ^= xor_mask;
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
        }
    } else if (blck_size_interleave == 4) {
        const uint32_t xor_mask = 0x88888888;
        for (int i = 0; i < end; ++i) {
            int src_id     = i % 4;
            int src_offset = (i / 4) * blck_size_interleave;
            int dst_offset = i * blck_size_interleave;

            uint32_t elems;
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
            elems ^= xor_mask;
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
        }
    } else {
        GGML_ASSERT(false); // only 4- and 8-byte interleave sizes are supported
    }

    return out;
}
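
// Same packing as make_block_q4_0x4, but for eight q4_0 blocks and a fixed
// 8-byte interleave chunk.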
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x8 out;

    // copy the delta (scale) of each source block
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].d;
    }

    // number of chunks: 8 blocks * QK4_0/2 quant bytes each, divided by the chunk size
    const int end = QK4_0 * 4 / blck_size_interleave;
    const uint64_t xor_mask = 0x8888888888888888ULL;

    for (int i = 0; i < end; ++i) {
        int src_id     = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        elems ^= xor_mask;
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    return out;
}
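
// Quantize a row-major f32 matrix to q4_0 and rearrange the blocks into the
// interleaved layout: rows are processed in groups of nrows_interleaved, and
// the blocks at the same column position within a group are packed together
// via make_block_q4_0x4/x8. The returned size equals the plain q4_0 size,
// since interleaving only permutes the data.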
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
    assert(n_per_row % QK4_0 == 0);
    assert(nrows_interleaved <= 8);
    const int nb = n_per_row / QK4_0; // q4_0 blocks per row

    void * out_ptr = NULL;
    if (nrows_interleaved == 8) {
        out_ptr = (block_q4_0x8 *) dst;
    } else if (nrows_interleaved == 4) {
        out_ptr = (block_q4_0x4 *) dst;
    }

    block_q4_0 dst_tmp[8]; // staging area for one group of quantized blocks

    // b walks the source in groups of nrows_interleaved rows;
    // int64_t avoids overflow of b for large matrices
    for (int64_t b = 0; b < nrow * n_per_row; b += nrows_interleaved * n_per_row) {
        for (int64_t x = 0; x < nb; x++) {
            // quantize the block at column position x of each row in the group
            for (int i = 0; i < nrows_interleaved; i++) {
                quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, dst_tmp + i, QK4_0);
            }

            // pack the group into one interleaved block and advance the output
            if (nrows_interleaved == 8) {
                *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
                out_ptr = (block_q4_0x8 *) out_ptr + 1;
            } else if (nrows_interleaved == 4) {
                *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
                out_ptr = (block_q4_0x4 *) out_ptr + 1;
            }
        }
    }

    return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}
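
// Public entry points. The suffix encodes nrows_interleaved x blck_size_interleave.
// quant_weights (importance weights) is accepted for interface compatibility with
// the other quantize_* functions but is unused: these layouts reuse the plain
// q4_0 reference quantization.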
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}

size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}

size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}