From 70269cae37538461ff816e714afbb3ebcdcdc26b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 1 May 2023 14:54:59 +0300
Subject: [PATCH] llama : fix session load / save (#1263)

---
 examples/main/main.cpp |  20 +++----
 llama.cpp              | 133 ++++++++++++++++++++++++-----------------
 llama.h                |  12 ++--
 3 files changed, 96 insertions(+), 69 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 990d0fa..78fc9a1 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -161,23 +161,22 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> session_tokens;
 
     if (!path_session.empty()) {
-        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
 
-        // REVIEW - fopen to check for existing session
+        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+
+        // fopen to check for existing session
         FILE * fp = std::fopen(path_session.c_str(), "rb");
         if (fp != NULL) {
             std::fclose(fp);
 
             session_tokens.resize(params.n_ctx);
             size_t n_token_count_out = 0;
-            const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                return 1;
+            }
             session_tokens.resize(n_token_count_out);
 
-            if (n_session_bytes > 0) {
-                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
-            } else {
-                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
-            }
+            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
             fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
         }
@@ -214,7 +213,7 @@ int main(int argc, char ** argv) {
     }
 
     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
        params.n_keep = (int)embd_inp.size();
     }
 
@@ -329,7 +328,7 @@ int main(int argc, char ** argv) {
             // insert n_left/2 tokens at the start of embd from last_n_tokens
             embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
-            // REVIEW - stop saving session if we run out of context
+            // stop saving session if we run out of context
             path_session = "";
 
             //printf("\n---\n");
@@ -355,6 +354,7 @@ int main(int argc, char ** argv) {
                    n_session_consumed++;
 
                    if (n_session_consumed >= (int) session_tokens.size()) {
+                        ++i;
                        break;
                    }
                }
diff --git a/llama.cpp b/llama.cpp
index 0d094a5..868a58a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2566,6 +2566,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_exp = llama_get_state_size(ctx);
+
+        if (n_state_size_cur != n_state_size_exp) {
+            fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_cur);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size);
+        llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size);
+    }
+
+    return true;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
@@ -2693,57 +2772,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
-
-size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    // TODO leverage mmap
-    llama_file file(path_session, "rb");
-    const uint32_t magic = file.read_u32();
-    const uint32_t version = file.read_u32();
-
-    if (!(magic == 'ggsn' && version == 0)) {
-        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-        return 0;
-    }
-
-    llama_hparams session_hparams;
-    file.read_raw(&session_hparams, sizeof(llama_hparams));
-
-    // REVIEW
-    if (session_hparams != ctx->model.hparams) {
-        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-        return 0;
-    }
-
-    const uint32_t n_token_count = file.read_u32();
-    LLAMA_ASSERT(n_token_capacity >= n_token_count);
-    file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-    *n_token_count_out = n_token_count;
-
-    const size_t n_state_size = file.size - file.tell();
-    const size_t n_orig_state_size = llama_get_state_size(ctx);
-    if (n_state_size != n_orig_state_size) {
-        fprintf(stderr, "%s : failed to validate state size\n", __func__);
-    }
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    file.read_raw(state_data.get(), n_state_size);
-    return llama_set_state_data(ctx, state_data.get());
-}
-
-size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    // TODO save temp & swap
-    llama_file file(path_session, "wb");
-
-    const size_t n_state_size = llama_get_state_size(ctx);
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    llama_copy_state_data(ctx, state_data.get());
-
-    file.write_u32('ggsn'); // magic
-    file.write_u32(0); // version
-    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
-    file.write_u32((uint32_t) n_token_count); // REVIEW
-    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
-    file.write_raw(state_data.get(), n_state_size);
-    return n_state_size; // REVIEW
-}
diff --git a/llama.h b/llama.h
index 9fbba76..2f6ce8d 100644
--- a/llama.h
+++ b/llama.h
@@ -19,9 +19,11 @@
 #    define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+#define LLAMA_FILE_VERSION           1
+#define LLAMA_FILE_MAGIC             'ggjt'
+#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+#define LLAMA_SESSION_MAGIC          'ggsn'
+#define LLAMA_SESSION_VERSION        0
 
 #ifdef __cplusplus
 extern "C" {
@@ -138,8 +140,8 @@ extern "C" {
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Save/load session file
-    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
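
Usage note (not part of the patch): a minimal sketch of how a caller might
drive the new bool-returning session API, mirroring the flow in
examples/main/main.cpp. llama_context_default_params(), llama_init_from_file()
and llama_free() come from the llama.h of this era but sit outside this diff,
and the model/session paths are placeholders. Returning bool instead of a byte
count makes every failure mode (bad magic/version, hparams mismatch, capacity
or state-size mismatch) unambiguous to the caller.

    // session_demo.cpp - illustrative only; assumes a ggml-era model file
    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main() {
        llama_context_params lparams = llama_context_default_params();

        llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams);
        if (ctx == NULL) {
            return 1;
        }

        std::vector<llama_token> tokens(lparams.n_ctx);
        size_t n_token_count = 0;

        // like main.cpp, check that the session file exists before loading,
        // since the loader expects a readable file
        FILE * fp = std::fopen("session.bin", "rb");
        if (fp != NULL) {
            std::fclose(fp);

            if (!llama_load_session_file(ctx, "session.bin", tokens.data(), tokens.size(), &n_token_count)) {
                fprintf(stderr, "failed to load session\n");
                return 1;
            }
            tokens.resize(n_token_count); // actual prompt length read from the file
        } else {
            tokens.clear(); // no session yet, start fresh
        }

        // ... evaluate the prompt, sample tokens, append them to `tokens` ...

        // one call writes magic, version, hparams, the prompt tokens and the
        // full context state; returns true on success
        if (!llama_save_session_file(ctx, "session.bin", tokens.data(), tokens.size())) {
            fprintf(stderr, "failed to save session\n");
        }

        llama_free(ctx);
        return 0;
    }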