Fix Metal backend broken from the allocator changes (#2455)
* fix Metal backend broken from the allocator changes
This commit is contained in:
parent a113689571
commit 9d2382b3e4

1 changed file with 10 additions and 7 deletions
llama.cpp | 17 ++++++++++-------
diff --git a/llama.cpp b/llama.cpp
--- a/llama.cpp
+++ b/llama.cpp
@@ -1812,6 +1812,12 @@ static bool llama_eval_internal(
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -1825,7 +1831,10 @@ static bool llama_eval_internal(
         //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+        ggml_metal_get_tensor   (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1856,12 +1865,6 @@ static bool llama_eval_internal(
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
-    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
     }
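Judging from the diff, the fix resolves `res` (the logits) and `embeddings` (the normalized hidden state) from the graph itself before the backend branch, so the Metal path can read back the real output tensor instead of `cur`, an intermediate that no longer points at the graph output after the allocator changes. Below is a minimal standalone sketch of that lookup-and-verify pattern; the `*_sketch` types are simplified stand-ins for illustration only, not the real ggml structs or API.

// sketch.cpp — minimal standalone sketch of the pattern this commit adopts:
// resolve the graph's output tensors by position, then verify them by name
// before reading their data back from the backend.
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

struct ggml_tensor_sketch {
    const char * name;
};

struct ggml_cgraph_sketch {
    std::vector<ggml_tensor_sketch *> nodes;
    int n_nodes;
};

int main() {
    // In the real graph, "result_norm" and "result_output" are the last two
    // nodes appended during graph construction.
    ggml_tensor_sketch norm   = { "result_norm"   };
    ggml_tensor_sketch output = { "result_output" };
    ggml_cgraph_sketch gf     = { { &norm, &output }, 2 };

    // Same positional lookup as the patch: the last node is the logits
    // tensor, the one before it is the embeddings tensor.
    ggml_tensor_sketch * res        = gf.nodes[gf.n_nodes - 1];
    ggml_tensor_sketch * embeddings = gf.nodes[gf.n_nodes - 2];

    // Verify by name before using the tensors (the patch uses LLAMA_ASSERT).
    assert(strcmp(res->name,        "result_output") == 0);
    assert(strcmp(embeddings->name, "result_norm")   == 0);

    printf("resolved outputs: %s, %s\n", res->name, embeddings->name);
    return 0;
}

Resolving the outputs once, above the backend branch, means the Metal and CPU paths read the same verified tensors; the embeddings readback stays conditional because `lctx.embedding` is only populated when the caller requested embeddings.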