From 84ca9c2ecf3391d911589d0fe2b483cbfb4b82a6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 29 Apr 2023 13:48:11 +0300
Subject: [PATCH] examples : fix save-load-state + rename llama-util.h

---
 examples/save-load-state/save-load-state.cpp | 72 +++++++++++---------
 llama_util.h => llama-util.h                 |  1 -
 llama.cpp                                    |  3 +-
 3 files changed, 42 insertions(+), 34 deletions(-)
 rename llama_util.h => llama-util.h (99%)
 mode change 100755 => 100644

diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 07dfa2c..f5f02ec 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,13 +1,10 @@
+#include "common.h"
+#include "llama.h"
+
 #include <vector>
 #include <cstdio>
 #include <chrono>
 
-#include "common.h"
-#include "llama.h"
-#include "llama.cpp"
-
-using namespace std;
-
 int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
@@ -20,21 +17,25 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_predict < 0) {
+        params.n_predict = 16;
+    }
+
     auto lparams = llama_context_default_params();
 
-    lparams.n_ctx = params.n_ctx;
-    lparams.n_parts = params.n_parts;
-    lparams.seed = params.seed;
-    lparams.f16_kv = params.memory_f16;
-    lparams.use_mmap = params.use_mmap;
-    lparams.use_mlock = params.use_mlock;
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
 
     auto n_past = 0;
-    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
 
     // init
     auto ctx = llama_init_from_file(params.model.c_str(), lparams);
-    auto tokens = vector<llama_token>(params.n_ctx);
+    auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
 
     if (n_prompt_tokens < 1) {
@@ -43,23 +44,25 @@ int main(int argc, char ** argv) {
     }
 
     // evaluate prompt
-
     llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
 
     last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
     n_past += n_prompt_tokens;
 
+    const size_t state_size = llama_get_state_size(ctx);
+    uint8_t * state_mem = new uint8_t[state_size];
+
     // Save state (rng, logits, embedding and kv_cache) to file
-    FILE *fp_write = fopen("dump_state.bin", "wb");
-    auto state_size = llama_get_state_size(ctx);
-    auto state_mem = new uint8_t[state_size];
-    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
-    fwrite(state_mem, 1, state_size, fp_write);
-    fclose(fp_write);
+    {
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        fwrite(state_mem, 1, state_size, fp_write);
+        fclose(fp_write);
+    }
 
     // save state (last tokens)
-    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
-    auto n_past_saved = n_past;
+    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
+    const auto n_past_saved = n_past;
 
     // first run
     printf("\n%s", params.prompt.c_str());
@@ -75,6 +78,7 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             return 1;
@@ -88,18 +92,21 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
 
     // load new model
-
     auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
 
     // Load state (rng, logits, embedding and kv_cache) from file
-    FILE *fp_read = fopen("dump_state.bin", "rb");
-    auto state_size2 = llama_get_state_size(ctx2);
-    if (state_size != state_size2) {
-        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    {
+        FILE *fp_read = fopen("dump_state.bin", "rb");
+        if (state_size != llama_get_state_size(ctx2)) {
+            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            return 1;
+        }
+        fread(state_mem, 1, state_size, fp_read);
+        llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+        fclose(fp_read);
     }
-    fread(state_mem, 1, state_size, fp_read);
-    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
-    fclose(fp_read);
+
+    delete[] state_mem;
 
     // restore state (last tokens)
     last_n_tokens_data = last_n_tokens_data_saved;
@@ -118,6 +125,7 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx2, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             return 1;
@@ -125,6 +133,8 @@ int main(int argc, char ** argv) {
         }
         n_past += 1;
     }
 
+    printf("\n\n");
+
     return 0;
 }
diff --git a/llama_util.h b/llama-util.h
old mode 100755
new mode 100644
similarity index 99%
rename from llama_util.h
rename to llama-util.h
index 6e66d12..ca4dd16
--- a/llama_util.h
+++ b/llama-util.h
@@ -430,5 +430,4 @@ struct llama_ctx_buffer {
 typedef llama_buffer llama_ctx_buffer;
 #endif
 
-
 #endif
diff --git a/llama.cpp b/llama.cpp
index 1032fb9..dc4bdc5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5,7 +5,7 @@
 #include <cstdio>
 #endif
 
-#include "llama_util.h"
+#include "llama-util.h"
 #include "llama.h"
 
 #include "ggml.h"
@@ -33,7 +33,6 @@
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,