diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6d79a7e..decf41a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { // params.prompt = R"(// this function checks if the number n is prime //bool is_prime(int n) {)"; - + llama_context * ctx; g_ctx = &ctx; diff --git a/ggml.c b/ggml.c index 46c0292..d9a95af 100644 --- a/ggml.c +++ b/ggml.c @@ -2635,15 +2635,15 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0ls)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0hs)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0hs)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1ls)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1hs)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1hs)); + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); diff --git a/llama.cpp b/llama.cpp index 4e92f55..34327ec 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2256,7 +2256,6 @@ std::vector>& llama_internal_get_te // Returns the size of the state size_t llama_get_state_size(struct llama_context * ctx) { - const size_t s_bool = sizeof(int32_t); // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 5a54101..7e091e8 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -120,7 +120,7 @@ int main(int argc, char * argv[]) { ggml_type type = (ggml_type) i; quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); - if (qfns.quantize_row_q) { + if (qfns.quantize_row_q && qfns.dequantize_row_q) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR); num_failed += failed; diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 883df05..d551445 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -225,7 +225,7 @@ int main(int argc, char * argv[]) { continue; } - if (qfns.quantize_row_q) { + if (qfns.quantize_row_q && qfns.dequantize_row_q) { printf("%s\n", ggml_type_name(type)); if (params.op_quantize_row_q_reference) {