llama : fix kv_cache n init (close #1903)

2024-11-09 23:29:44 +00:00 · 2023-06-17 19:30:22 +03:00 · 2023-06-17 19:30:22 +03:00 · 051e1b0e6a
commit 051e1b0e6a
parent 86c7571864
3 changed files with 4 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ models/*
 /perplexity
 /embedding
 /train-text-from-scratch
 /simple
 /benchmark-matmult
 /vdot
 /server
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -38,6 +38,7 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(simple)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/llama.cpp
+++ b/llama.cpp
@@ -886,6 +886,7 @@ static bool kv_cache_init(
    const int64_t n_elements = n_embd*n_mem;
    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
    cache.n = 0;
    struct ggml_init_params params;
    params.mem_size   = cache.buf.size;
@@ -904,6 +905,7 @@ static bool kv_cache_init(
    ggml_set_name(cache.k, "cache_k");
    ggml_set_name(cache.v, "cache_v");
    (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > n_layer + 1) {
        ggml_cuda_assign_buffers_no_scratch(cache.v);