diff --git a/Makefile b/Makefile
index 1601079..44fb298 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ endif
 #
 
 CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
 LDFLAGS =
 
 # OS specific
diff --git a/README.md b/README.md
index dae1bf1..c7e5d33 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
 
+**TEMPORARY NOTICE:**
+If you're updating to the latest master, you will need to regenerate your model files as the format has changed.
+
 ## Description
 
 The main goal is to run the model using 4-bit quantization on a MacBook
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index c1941a8..42f5377 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
     keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
     values = [
-        0x67676d6c, # magic: ggml in hex
+        0x67676d66, # magic: ggmf in hex
+        1, # file version
         *[hparams[key] for key in keys],
         hparams["dim"] // hparams["n_heads"], # rot (obsolete)
         ftype
     ]
@@ -85,6 +86,7 @@
         text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 
 def process_and_write_variables(fout, model, ftype):
diff --git a/main.cpp b/main.cpp
index c005d17..1590333 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,6 +3,7 @@
 #include "utils.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -105,10 +106,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
+        if (magic == 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+                    __func__, fname.c_str());
+            return false;
+        }
+        if (magic != 0x67676d66) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
             return false;
         }
+
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
+                    __func__, fname.c_str(), format_version);
+            return false;
+        }
     }
 
     int n_ff = 0;
@@ -154,8 +169,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);
 
+            float score;
+            fin.read((char *) &score, sizeof(score));
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
 
             //if (i < 30000) {
             //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
diff --git a/quantize.cpp b/quantize.cpp
index 14c7b27..166e916 100644
--- a/quantize.cpp
+++ b/quantize.cpp
@@ -3,6 +3,7 @@
 #include "utils.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
     {
         uint32_t magic;
         finp.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
+        if (magic == 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+                    __func__, fname_inp.c_str());
+            return false;
+        }
+        if (magic != 0x67676d66) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n",
                     __func__, fname_inp.c_str());
             return false;
         }
         fout.write((char *) &magic, sizeof(magic));
+
+        uint32_t format_version;
+        finp.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
+                    __func__, fname_inp.c_str(), format_version);
+            return false;
+        }
+
+        fout.write((char *) &format_version, sizeof(format_version));
     }
 
     llama_hparams hparams;
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
             word.resize(len);
             finp.read ((char *) word.data(), len);
             fout.write((char *) word.data(), len);
 
+            float score;
+            finp.read ((char *) &score, sizeof(score));
+            fout.write((char *) &score, sizeof(score));
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
         }
     }
 
diff --git a/utils.cpp b/utils.cpp
index 08d5c6b..188f114 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -6,6 +6,7 @@
 #include <fstream>
 #include <iostream>
 #include <iterator>
+#include <queue>
 #include <regex>
 #include <string>
 #include <vector>
@@ -294,58 +295,146 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
-// TODO: Calculate this constant from the vocabulary
-#define MAX_TOKEN_LEN 18
-// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    std::vector<gpt_vocab::id> res;
-    std::vector<int> score;
-    std::vector<gpt_vocab::id> prev;
-    int len = text.length();
+static size_t utf8_len(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
 
-    score.resize(len + 1);
-    prev.resize(len + 1);
+struct llama_sp_symbol {
+    using index = int;
+    index prev;
+    index next;
+    std::string_view text;
+};
 
-    // Forward pass
-    for (int i = 0; i < len; i++) {
-        int max_len = std::min(len - i, MAX_TOKEN_LEN);
-        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
-            auto sub = text.substr(i, sub_len);
-            auto token = vocab.token_to_id.find(sub);
-            if (token != vocab.token_to_id.end()) {
-                int token_score = sub.length() * sub.length();
-                int local_score = score[i] + token_score;
-                int next = i + sub_len;
-                if (score[next] < local_score) {
-                    score[next] = local_score;
-                    prev[next] = (*token).second;
+struct llama_sp_bigram {
+    struct comparator {
+        bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llama_sp_bigram>;
+    using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
+    llama_sp_symbol::index left;
+    llama_sp_symbol::index right;
+    float score;
+    size_t size;
+};
+
+struct llama_tokenizer {
+    llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}
+
+    void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        while (!text.empty()) {
+            llama_sp_symbol sym;
+            size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
+            sym.text = std::string_view(text.data(), char_len);
+            sym.prev = index - 1;
+            text.remove_prefix(char_len);
+            sym.next = text.empty() ? -1 : index + 1;
+            index++;
+            symbols_.emplace_back(std::move(sym));
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols_.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue_.empty()) {
+            auto bigram = work_queue_.top();
+            work_queue_.pop();
+
+            auto & left_sym = symbols_[bigram.left];
+            auto & right_sym = symbols_[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
+            if (left_sym.text.empty() || right_sym.text.empty() ||
+                left_sym.text.size() + right_sym.text.size() != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
+            right_sym.text = std::string_view("");
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols_[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        for (int i = 0; i != -1; i = symbols_[i].next) {
+            auto& symbol = symbols_[i];
+            auto token = vocab_.token_to_id.find(std::string(symbol.text));
+
+            if (token == vocab_.token_to_id.end()) {
+                // output any symbols that did not form tokens as bytes.
+                for (int j = 0; j < symbol.text.size(); ++j) {
+                    gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    output.push_back(token_id);
                 }
+            } else {
+                output.push_back((*token).second);
             }
         }
     }
 
-    // Backward pass
-    int i = len;
-    while (i > 0) {
-        gpt_vocab::id token_id = prev[i];
-        if (token_id == 0) {
-            // TODO: Return error or something more meaningful
-            printf("failed to tokenize string!\n");
-            break;
+private:
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
         }
-        res.push_back(token_id);
-        auto token = (*vocab.id_to_token.find(token_id)).second;
-        i -= token.length();
+
+        std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
+        auto token = vocab_.token_to_id.find(std::string(text));
+
+        if (token == vocab_.token_to_id.end()) {
+            return;
+        }
+
+        auto score = vocab_.score.find((*token).second);
+
+        if (score == vocab_.score.end()) {
+            return;
+        }
+
+        llama_sp_bigram bigram;
+        bigram.left = left;
+        bigram.right = right;
+        bigram.score = (*score).second;
+        bigram.size = text.size();
+        work_queue_.push(bigram);
+    }
+
+    const gpt_vocab & vocab_;
+    std::vector<llama_sp_symbol> symbols_;
+    llama_sp_bigram::queue work_queue_;
+};
+
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
+    llama_tokenizer tokenizer(vocab);
+    std::vector<gpt_vocab::id> output;
+
+    if (text.size() == 0) {
+        return output;
     }
 
     if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
+        output.push_back(1);
     }
 
-    // Pieces are in reverse order so correct that
-    std::reverse(res.begin(), res.end());
-
-    return res;
+    tokenizer.tokenize(text, output);
+    return output;
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
diff --git a/utils.h b/utils.h
index 49658f7..b3a0f47 100644
--- a/utils.h
+++ b/utils.h
@@ -58,6 +58,7 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::map<id, float> score;
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
@@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
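
Note on the new file layout (illustration only, not part of the patch): after this change a model file starts with the magic 0x67676d66 ("ggmf" in hex), followed by a 32-bit format version (currently 1), then the hyperparameters, then a vocabulary whose entries now carry a float score after the token bytes. The standalone sketch below shows how a consumer of the new format might validate the header and load the scored vocabulary; the function name read_scored_vocab and the n_vocab parameter are illustrative assumptions (in the real files n_vocab comes from the hyperparameter block, which the sketch skips), while the byte layout mirrors what convert-pth-to-ggml.py writes above.

// Sketch (assumed helper, not in the patch): reject old/unknown files and read
// the scored vocabulary of a version-1 ggmf model file.
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

bool read_scored_vocab(const std::string & fname, int n_vocab,
                       std::vector<std::string> & tokens,
                       std::vector<float> & scores) {
    std::ifstream fin(fname, std::ios::binary);
    if (!fin) {
        return false;
    }

    uint32_t magic = 0;
    fin.read((char *) &magic, sizeof(magic));
    if (magic == 0x67676d6c) {
        fprintf(stderr, "old unversioned model file - regenerate it\n");
        return false;
    }
    if (magic != 0x67676d66) {
        fprintf(stderr, "not a ggmf model file (bad magic)\n");
        return false;
    }

    uint32_t version = 0;
    fin.read((char *) &version, sizeof(version));
    if (version != 1) {
        fprintf(stderr, "unsupported format version %" PRIu32 "\n", version);
        return false;
    }

    // ... the hyperparameters would be read here, as llama_model_load() does ...

    tokens.resize(n_vocab);
    scores.resize(n_vocab);
    for (int i = 0; i < n_vocab; i++) {
        uint32_t len = 0;
        fin.read((char *) &len, sizeof(len));          // token length
        tokens[i].resize(len);
        fin.read(&tokens[i][0], len);                  // token bytes
        fin.read((char *) &scores[i], sizeof(float));  // new in version 1: per-token score
    }
    return fin.good();
}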