Mirror of https://git.adityakumar.xyz/llama.cpp.git (synced 2024-11-09 23:29:44 +00:00)
llama : temporary disable Q6_K output quantization (#1711)
parent 590250f7a9
commit 7a74dee6b4
1 changed file with 9 additions and 4 deletions
llama.cpp | 13 +++++++++----
@@ -2198,8 +2198,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2207,7 +2211,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2215,10 +2219,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
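For context, a small standalone sketch of how the per-tensor type selection reads after this patch: the Q6_K override for output.weight is commented out, and the remaining name-based rules apply on their own. This is not the actual llama.cpp source; the enums and the pick_k_quant_type helper are simplified stand-ins invented for illustration, and the per-layer counters (i_attention_wv, i_feed_forward_w2) plus the Q4_K_M/Q5_K_M special cases from the real code are omitted.

    #include <cstdio>
    #include <string>

    // Minimal stand-ins for the real GGML / llama enums (illustrative only).
    enum ggml_type   { GGML_TYPE_Q2_K, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K };
    enum llama_ftype { LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q3_K_M, LLAMA_FTYPE_MOSTLY_Q3_K_L };

    // Hypothetical helper mirroring the post-patch selection for one tensor.
    static ggml_type pick_k_quant_type(const std::string & name, llama_ftype ftype, ggml_type quantized_type) {
        ggml_type new_type = quantized_type;
        // TODO: temporary disabled until Metal / OpenCL support is available
        // ref: https://github.com/ggerganov/llama.cpp/issues/1711
        //if (name == "output.weight") {
        //    new_type = GGML_TYPE_Q6_K;
        //}
        if (name.find("attention.wv.weight")    != std::string::npos ||
            name.find("feed_forward.w2.weight") != std::string::npos ||
            name.find("attention.wo.weight")    != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)                                new_type = GGML_TYPE_Q5_K;
        }
        return new_type;
    }

    int main() {
        // With the patch applied, output.weight keeps the base quantized type instead of being bumped to Q6_K.
        printf("output.weight                -> %d\n", pick_k_quant_type("output.weight",                LLAMA_FTYPE_MOSTLY_Q3_K_M, GGML_TYPE_Q2_K));
        printf("layers.0.attention.wv.weight -> %d\n", pick_k_quant_type("layers.0.attention.wv.weight", LLAMA_FTYPE_MOSTLY_Q3_K_M, GGML_TYPE_Q2_K));
        return 0;
    }

In the real function the three name checks stay as separate if blocks, each with its own Q6_K promotion for selected layers; the sketch collapses them only to keep the part this commit touches readable.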