From 69c92298a9e36dc2363b3bf50452976ce49487b3 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Wed, 22 Mar 2023 17:29:06 +0000 Subject: [PATCH] Deduplicate q4 quantization functions (#383) * Deduplicate q4 quantization functions * Use const; add basic test * Re-enable quantization test * Disable AVX2 flags in CI --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 2 +- ggml.c | 171 ++++++++++++++---------------------- ggml.h | 4 +- tests/CMakeLists.txt | 13 ++- tests/test-quantize.c | 42 +++++++++ 5 files changed, 119 insertions(+), 113 deletions(-) create mode 100644 tests/test-quantize.c diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5882fc7..6ce9cc7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -89,7 +89,7 @@ jobs: run: | mkdir build cd build - cmake .. + cmake -DLLAMA_AVX2=OFF .. cmake --build . --config Release ctest --output-on-failure diff --git a/ggml.c b/ggml.c index 7ea9f62..0e4b146 100644 --- a/ggml.c +++ b/ggml.c @@ -403,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes ) // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) + +// reference implementation for deterministic creation of model files +static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) { + assert(k % QK == 0); + const int nb = k / QK; + + const size_t bs = sizeof(float) + QK/2; + + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK; l++) { + const float v = x[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 3) - 1); + const float id = d ? 1.0f/d : 0.0f; + + *(float *)pd = d; + pd += bs; + + for (int l = 0; l < QK; l += 2) { + const float v0 = x[i*QK + l + 0]*id; + const float v1 = x[i*QK + l + 1]*id; + + const uint8_t vi0 = ((int8_t) (round(v0))) + 8; + const uint8_t vi1 = ((int8_t) (round(v1))) + 8; + + assert(vi0 >= 0 && vi0 < 16); + assert(vi1 >= 0 && vi1 < 16); + + pp[l/2] = vi0 | (vi1 << 4); + } + + memcpy(pb, pp, sizeof(pp)); + pb += bs; + } +} + void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); +#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -413,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); uint8_t pp[QK/2]; +#endif #if __ARM_NEON #if QK == 32 @@ -569,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { #endif #else // scalar - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; - - *(float *)pd = d; - pd += bs; - - for (int l = 0; l < QK; l += 2) { - const float v0 = x[i*QK + l + 0]*id; - const float v1 = x[i*QK + l + 1]*id; - - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, sizeof(pp)); - pb += bs; - } + quantize_row_q4_0_reference(x, y, k); #endif } @@ -10705,119 +10723,60 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; assert(k % qk == 0); - const size_t pp_size = qk / 2; - uint8_t * pp = (uint8_t *) alloca(pp_size); - char * pdst = (char *) dst; for (int j = 0; j < n; j += k) { uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + quantize_row_q4_0_reference(src + j, pd, k); + for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max + for (int l = 0; l < qk; l += 2) { + const uint8_t vi0 = pb[l/2] & 0xF; + const uint8_t vi1 = pb[l/2] >> 4; - { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; - - *(float *) pd = d; - pd += bs; - - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0])*id; - const float v1 = (src[j + i*qk + l + 1])*id; - - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - hist[vi0]++; - hist[vi1]++; - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, pp_size); - pb += bs; + hist[vi0]++; + hist[vi1]++; } + pb += bs; } } return (n/k)*row_size; } -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; assert(k % qk == 0); - const size_t pp_size = qk / 2; - uint8_t * pp = (uint8_t *) alloca(pp_size); - char * pdst = (char *) dst; for (int j = 0; j < n; j += k) { uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); - uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); - //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); + quantize_row_q4_1(src + j, pd, k); for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; + for (int l = 0; l < qk; l += 2) { + const uint8_t vi0 = pb[l/2] & 0xF; + const uint8_t vi1 = pb[l/2] >> 4; - { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + l]; - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - *(float *) pd = d; - *(float *) pm = min; - pd += bs; - pm += bs; - - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0] - min)*id; - const float v1 = (src[j + i*qk + l + 1] - min)*id; - - const uint8_t vi0 = round(v0); - const uint8_t vi1 = round(v1); - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - hist[vi0]++; - hist[vi1]++; - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, pp_size); - pb += bs; + hist[vi0]++; + hist[vi1]++; } + pb += bs; } } diff --git a/ggml.h b/ggml.h index 48b6cc0..c7e6814 100644 --- a/ggml.h +++ b/ggml.h @@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt( // quantization // -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist); // // system info diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4990c34..6a4170f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,4 +1,9 @@ -set(TEST_TARGET test-tokenizer-0) -add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) -add_test(NAME ${TEST_TARGET} COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) +function(llama_add_test source) + get_filename_component(TEST_TARGET ${source} NAME_WE) + add_executable(${TEST_TARGET} ${source}) + target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) + add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) +endfunction() + +llama_add_test(test-quantize.c) +llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) diff --git a/tests/test-quantize.c b/tests/test-quantize.c new file mode 100644 index 0000000..d59ecb8 --- /dev/null +++ b/tests/test-quantize.c @@ -0,0 +1,42 @@ +#include "ggml.h" +#undef NDEBUG +#include +#include + +int main(void) { + #define QK 32 + float src[QK]; + uint8_t dst[24]; + int64_t hist[16]; + + for (int i = 0; i < QK; i++) { + src[i] = (float)(i + 1); + } + + size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist); + assert(size == 20); + float max_result = ((float *)dst)[0]; + float max_expected = src[31] / ((1 << 3) - 1); + assert(max_result == max_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF); + uint8_t q4_expected = roundf(src[i] / max_expected) + 8; + assert(q4_result == q4_expected); + } + + size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist); + assert(size == 24); + float delta_result = ((float *)dst)[0]; + float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); + assert(delta_result == delta_expected); + float min_result = ((float *)dst)[1]; + float min_expected = src[0]; + assert(min_result == min_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF); + uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected); + assert(q4_result == q4_expected); + } + + return 0; +}