Mirror of https://git.adityakumar.xyz/llama.cpp.git (synced 2024-11-09 23:29:44 +00:00)
llama : temporary disable Q6_K output quantization (#1711)
parent 590250f7a9
commit 7a74dee6b4
1 changed file with 9 additions and 4 deletions
llama.cpp | 13 +++++++++----
@@ -2198,8 +2198,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2207,7 +2211,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2215,10 +2219,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
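For context, a small standalone sketch of how the per-tensor type selection reads after this patch: the Q6_K override for output.weight is commented out, and the remaining name-based rules apply on their own. This is not the actual llama.cpp source; the enums and the pick_k_quant_type helper are simplified stand-ins invented for illustration, and the per-layer counters (i_attention_wv, i_feed_forward_w2) plus the Q4_K_M/Q5_K_M special cases from the real code are omitted.

    #include <cstdio>
    #include <string>

    // Minimal stand-ins for the real GGML / llama enums (illustrative only).
    enum ggml_type   { GGML_TYPE_Q2_K, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K };
    enum llama_ftype { LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q3_K_M, LLAMA_FTYPE_MOSTLY_Q3_K_L };

    // Hypothetical helper mirroring the post-patch selection for one tensor.
    static ggml_type pick_k_quant_type(const std::string & name, llama_ftype ftype, ggml_type quantized_type) {
        ggml_type new_type = quantized_type;
        // TODO: temporary disabled until Metal / OpenCL support is available
        // ref: https://github.com/ggerganov/llama.cpp/issues/1711
        //if (name == "output.weight") {
        //    new_type = GGML_TYPE_Q6_K;
        //}
        if (name.find("attention.wv.weight")    != std::string::npos ||
            name.find("feed_forward.w2.weight") != std::string::npos ||
            name.find("attention.wo.weight")    != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)                                new_type = GGML_TYPE_Q5_K;
        }
        return new_type;
    }

    int main() {
        // With the patch applied, output.weight keeps the base quantized type instead of being bumped to Q6_K.
        printf("output.weight                -> %d\n", pick_k_quant_type("output.weight",                LLAMA_FTYPE_MOSTLY_Q3_K_M, GGML_TYPE_Q2_K));
        printf("layers.0.attention.wv.weight -> %d\n", pick_k_quant_type("layers.0.attention.wv.weight", LLAMA_FTYPE_MOSTLY_Q3_K_M, GGML_TYPE_Q2_K));
        return 0;
    }

In the real function the three name checks stay as separate if blocks, each with its own Q6_K promotion for selected layers; the sketch collapses them only to keep the part this commit touches readable.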