diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 446b8e8..9f9ed9d 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,6 +1,7 @@ -#include #include "ggml.h" #include "build-info.h" + +#include #include #include #include diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index c24f7f8..03603b1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -31,6 +31,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } + llama_init_backend(); + llama_context * ctx; // load the model diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4d886f8..47b418d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -96,8 +96,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } -// params.prompt = R"(// this function checks if the number n is prime -//bool is_prime(int n) {)"; + llama_init_backend(); llama_context * ctx; g_ctx = &ctx; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9d38626..e19c682 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -143,6 +143,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } + llama_init_backend(); + llama_context * ctx; // load the model and apply lora adapter, if any diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 115d8fb..769dd36 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,7 +1,7 @@ -#include "ggml.h" -#include "llama.h" #include "build-info.h" +#include "llama.h" + #include #include #include @@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] // int main(int argc, char ** argv) { - ggml_time_init(); - if (argc < 3) { fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]); for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) { @@ -52,12 +50,7 @@ int main(int argc, char ** argv) { return 1; } - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } + llama_init_backend(); // parse command line arguments const std::string fname_inp = argv[1]; @@ -116,25 +109,25 @@ int main(int argc, char ** argv) { } fprintf(stderr, "\n"); - const int64_t t_main_start_us = ggml_time_us(); + const int64_t t_main_start_us = llama_time_us(); int64_t t_quantize_us = 0; // load the model { - const int64_t t_start_us = ggml_time_us(); + const int64_t t_start_us = llama_time_us(); if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) { fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); return 1; } - t_quantize_us = ggml_time_us() - t_start_us; + t_quantize_us = llama_time_us() - t_start_us; } // report timing { - const int64_t t_main_end_us = ggml_time_us(); + const int64_t t_main_end_us = llama_time_us(); printf("\n"); printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0); diff --git a/llama.cpp b/llama.cpp index dd44959..5e6980b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -839,6 +839,21 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } +void llama_init_backend() { + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } +} + +int64_t llama_time_us() { + return ggml_time_us(); +} + // // model loading // diff --git a/llama.h b/llama.h index 8623e08..0a63d03 100644 --- a/llama.h +++ b/llama.h @@ -40,9 +40,9 @@ extern "C" { typedef int llama_token; typedef struct llama_token_data { - llama_token id; // token id - float logit; // log-odds of the token - float p; // probability of the token + llama_token id; // token id + float logit; // log-odds of the token + float p; // probability of the token } llama_token_data; typedef struct llama_token_data_array { @@ -73,16 +73,16 @@ extern "C" { // model file types enum llama_ftype { - LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); @@ -90,6 +90,13 @@ extern "C" { LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mlock_supported(); + // TODO: not great API - very likely to change + // Initialize the llama + ggml backend + // Call once at the start of the program + LLAMA_API void llama_init_backend(); + + LLAMA_API int64_t llama_time_us(); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure