diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 7d46115..df42e76 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggml compatible file
+# Convert a LLaMA model checkpoint to a ggjt compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@@ -52,8 +52,8 @@ GGML_BLCK_SIZE = {
 }
 
 GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK/2,
-    GGML_TYPE_Q4_1: 4*2 + QK/2,
+    GGML_TYPE_Q4_0: 4 + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
     GGML_TYPE_I8: 1,
     GGML_TYPE_I16: 2,
     GGML_TYPE_I32: 4,
@@ -245,11 +245,9 @@ def main():
         fname_model = f"{dir_model}/consolidated.00.pth"
         fname_out = f"{dir_model}/ggml-vocab.bin"
         print(f"Extracting only the vocab from '{fname_model}'\n")
-        model = torch.load(fname_model, map_location="cpu")
         with open(fname_out, "wb") as fout:
             write_header(fout, hparams, ftype)
             write_tokens(fout, tokenizer)
-        del model
         print(f"Done. Output file: {fname_out}\n")
         return
 
diff --git a/llama.cpp b/llama.cpp
index 28e885c..bed2420 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -347,14 +347,15 @@ static void munmap_file(void * addr, size_t length) {
 #endif
 }
 
-static bool report_bad_magic(const char *path) {
+static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
     fprintf(stderr,
-            "%s: invalid model file (bad magic)\n"
-            "you most likely need to regenerate your ggml files\n"
-            "the benefit is you'll get 10-100x faster load times\n"
-            "see https://github.com/ggerganov/llama.cpp/issues/91\n"
-            "use convert-pth-to-ggml.py on your llama model files\n",
-            path);
+            "%s: invalid model file (bad magic [got %#x want %#x])\n"
+            "\tyou most likely need to regenerate your ggml files\n"
+            "\tthe benefit is you'll get 10-100x faster load times\n"
+            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+            path, got, want);
     return false;
 }
 
@@ -397,7 +398,7 @@ static bool llama_model_load(
         return false;
     }
     if (magic != LLAMA_FILE_MAGIC) {
-        return report_bad_magic(fname.c_str());
+        return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
     }
 
     uint32_t format_version;
@@ -1312,7 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         return false;
    }
     if (magic != LLAMA_FILE_MAGIC) {
-        return report_bad_magic(fname_inp.c_str());
+        return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
     }
 
     fout.write((char *) &magic, sizeof(magic));
diff --git a/migrate-ggml-2023-03-30-pr613.py b/migrate-ggml-2023-03-30-pr613.py
new file mode 100644
index 0000000..5596f6c
--- /dev/null
+++ b/migrate-ggml-2023-03-30-pr613.py
@@ -0,0 +1,313 @@
+# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
+#
+# We caused a breaking change to the file format on 2023-03-30 in:
+#     https://github.com/ggerganov/llama.cpp/pull/613
+#
+# (1) If you still have the Meta LLaMA .pth files, then close this
+#     file now; you can just run `convert-pth-to-ggml.py` again to
+#     migrate to the new format. The tool is easier to use too. It
+#     isn't necessary anymore to manage split output files because
+#     the new format always combines things into a single file.
+#
+# (2) If you deleted the Meta LLaMA .pth files to save disk
+#     space, then this tool is intended to help you. Please check
+#     out the instructions below.
+#
+# USAGE
+#
+#     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
+#
+# PREREQUISITES
+#
+#     pip install numpy
+#     cd llama.cpp
+#     make -j4
+#
+# EXAMPLE (7B MODEL)
+#
+#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
+#     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
+#
+#     # check that it works
+#     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+#
+#     # you can delete the old files
+#     rm -f models/7B/ggml-model-f16.bin
+#     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
+#
+# EXAMPLE (13B MODEL)
+#
+#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
+#     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
+#
+#     # check that it works
+#     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+#
+#     # you can delete the old files
+#     rm -f models/13B/ggml-model-f16.bin*
+#     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
+#
+
+import argparse
+import os
+import sys
+import json
+import struct
+import numpy as np
+
+QK = 32
+
+GGML_TYPE_Q4_0 = 0
+GGML_TYPE_Q4_1 = 1
+GGML_TYPE_I8 = 2
+GGML_TYPE_I16 = 3
+GGML_TYPE_I32 = 4
+GGML_TYPE_F16 = 5
+GGML_TYPE_F32 = 6
+
+WTYPE_NAMES = {
+    0: "F32",
+    1: "F16",
+    2: "Q4_0",
+    3: "Q4_1",
+}
+
+WTYPES = {
+    0: GGML_TYPE_F32,
+    1: GGML_TYPE_F16,
+    2: GGML_TYPE_Q4_0,
+    3: GGML_TYPE_Q4_1,
+}
+
+GGML_BLCK_SIZE = {
+    GGML_TYPE_Q4_0: QK,
+    GGML_TYPE_Q4_1: QK,
+    GGML_TYPE_I8: 1,
+    GGML_TYPE_I16: 1,
+    GGML_TYPE_I32: 1,
+    GGML_TYPE_F16: 1,
+    GGML_TYPE_F32: 1,
+}
+
+GGML_TYPE_SIZE = {
+    GGML_TYPE_Q4_0: 4 + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
+    GGML_TYPE_I8: 1,
+    GGML_TYPE_I16: 2,
+    GGML_TYPE_I32: 4,
+    GGML_TYPE_F16: 2,
+    GGML_TYPE_F32: 4,
+}
+
+HPARAMS = [
+    'magic',   # int32
+    'version', # int32
+    'n_vocab', # int32
+    'n_embd',  # int32
+    'n_mult',  # int32
+    'n_head',  # int32
+    'n_layer', # int32
+    'n_rot',   # int32
+    'f16',     # int32
+]
+
+def read_hparams(fin):
+    struct_fmt = "i" * len(HPARAMS)
+    struct_size = struct.calcsize(struct_fmt)
+    buf = fin.read(struct_size)
+    ints = struct.unpack(struct_fmt, buf)
+    hparams = dict(zip(HPARAMS, ints))
+    return hparams
+
+def write_hparams(fout, hparams):
+    struct_fmt = "i" * len(HPARAMS)
+    struct_size = struct.calcsize(struct_fmt)
+    ints = [hparams[h] for h in HPARAMS]
+    fout.write(struct.pack(struct_fmt, *ints))
+
+def read_tokens(fin, hparams):
+    tokens = []
+    for i in range(hparams['n_vocab']):
+        len_b = fin.read(4)
+        (length,) = struct.unpack("i", len_b)
+        word = fin.read(length)
+        score_b = fin.read(4)
+        (score,) = struct.unpack("f", score_b)
+        tokens.append((word, score))
+    return tokens
+
+def write_tokens(fout, tokens):
+    for word, score in tokens:
+        fout.write(struct.pack("i", len(word)))
+        fout.write(word)
+        fout.write(struct.pack("f", score))
+
+def ggml_nelements(shape):
+    r = 1
+    for i in shape:
+        r *= i
+    return r
+
+def ggml_nbytes(shape, ftype):
+    x = ggml_nelements(shape)
+    t = WTYPES[ftype]
+    x *= GGML_TYPE_SIZE[t]
+    x //= GGML_BLCK_SIZE[t]
+    return x
+
+def copy_tensors(fin, fout, part_id, n_parts):
+    while True:
+
+        b = fin.read(4)
+        if not b: break
+        (n_dims,) = struct.unpack("i", b)
+        b = fin.read(4)
+        (length,) = struct.unpack("i", b)
+        b = fin.read(4)
+        (ftype,) = struct.unpack("i", b)
+
+        assert n_dims in (1, 2)
+
+        partshape = list(range(n_dims))
+        for i in range(n_dims):
+            b = fin.read(4)
+            partshape[i] = struct.unpack("i", b)[0]
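+        # dims are stored in the file fastest-varying first (ggml order);
+        # reverse them so partshape is in numpy-style (rows, cols) order used below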
+        partshape = list(reversed(partshape))
+
+        name = fin.read(length)
+        data = fin.read(ggml_nbytes(partshape, ftype))
+
+        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
+        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
+
+        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
+
+        # determine dimension along which multipart tensor is sharded
+        #
+        # split_dim 0 regex:
+        #   - output.*
+        #   - layers.*.attention.wq.weight
+        #   - layers.*.attention.wk.weight
+        #   - layers.*.attention.wv.weight
+        #   - layers.*.feed_forward.w1.weight
+        #   - layers.*.feed_forward.w3.weight
+        #
+        # split_dim 1 regex:
+        #   - tok_embeddings.*
+        #   - layers.*.attention.wo.weight
+        #   - layers.*.feed_forward.w2.weight
+        #
+        if n_dims > 1:
+            split_dim = 1
+            if b"tok_embeddings" in name:
+                split_dim = 1
+            elif b"layers" in name:
+                if b"attention.wo.weight" in name:
+                    split_dim = 1
+                elif b"feed_forward.w2.weight" in name:
+                    split_dim = 1
+                else:
+                    split_dim = 0
+            elif b"output" in name:
+                split_dim = 0
+
+        # output tensor header
+        fullshape = list(partshape)
+        if n_dims > 1:
+            fullshape[split_dim] *= n_parts
+        fout.write(struct.pack("iii", n_dims, len(name), ftype))
+        for dim in reversed(fullshape):
+            fout.write(struct.pack("i", dim))
+        fout.write(name)
+
+        # ensure tensor data is aligned
+        tensor_data_offset = fout.tell()
+        while tensor_data_offset % QK != 0:
+            fout.write(struct.pack("B", 0))
+            tensor_data_offset += 1
+
+        # output unified mappable tensor data
+        if n_dims == 1 or n_parts == 1:
+            # copy tensor which we thankfully received in one piece
+            if part_id == 0:
+                fout.write(data)
+        elif split_dim == 0:
+            # reassemble multifile tensor containing some of the rows
+            rows_per_chunk = partshape[0]
+            current_row = part_id * rows_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset = current_row * bytes_per_row
+            fout.seek(tensor_data_offset + offset)
+            fout.write(data)
+        elif split_dim == 1:
+            # reassemble multifile tensor containing some of the cols
+            cols_per_chunk = partshape[1]
+            current_col = part_id * cols_per_chunk
+            bpr = partshape[1] // blck_size * type_size
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset_current_col = current_col // blck_size * type_size
+            for row in range(partshape[0]):
+                offset_row = row * bytes_per_row
+                offset = offset_row + offset_current_col
+                fout.seek(tensor_data_offset + offset)
+                fout.write(data[row * bpr:row * bpr + bpr])
+
+        # advance file position to next tensor
+        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
+    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
+    parser.add_argument('fout_path', help='your new ggjt file name')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+    assert args.fin_path
+    assert args.fout_path
+    assert args.fin_path != args.fout_path
+
+    with open(args.fin_path, "rb") as fin:
+        hparams = read_hparams(fin)
+        tokens = read_tokens(fin, hparams)
+
+    if hparams['magic'] == 0x67676a74:  # ggjt
+        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
+              (args.fin_path))
+        sys.exit(1)
+
+    if hparams['magic'] != 0x67676d66:  # ggmf
+        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
+              (args.fin_path, hparams['magic']))
+        sys.exit(1)
+
+    hparams['magic'] = 0x67676a74  # ggjt
+
+    # count number of multipart files by convention
+    n_parts = 1
+    while True:
+        if os.path.exists("%s.%d" % (args.fin_path, n_parts)):
+            n_parts += 1
+        else:
+            break
+
+    # we output a single file for ggml
+    with open(args.fout_path, "wb") as fout:
+        write_hparams(fout, hparams)
+        write_tokens(fout, tokens)
+        offset_of_tensors = fout.tell()
+        # the tensors we load could be split across multiple files
+        for part_id in range(n_parts):
+            fout.seek(offset_of_tensors)
+            print(f"Processing part {part_id+1} of {n_parts}\n")
+            fin_path = args.fin_path
+            if part_id > 0:
+                fin_path += ".%d" % (part_id)
+            with open(fin_path, "rb") as fin:
+                read_tokens(fin, read_hparams(fin))
+                copy_tensors(fin, fout, part_id, n_parts)
+
+    print(f"Done. Output file: {args.fout_path}\n")
+
+if __name__ == "__main__":
+    main()
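For a quick pre-flight check, the header magic can be read the same way read_hparams() does, to tell whether a given file still needs migrating. This is a minimal standalone sketch, not part of the patch; the helper name and model path are placeholders:

import struct

GGJT_MAGIC = 0x67676a74  # new single-file format written by the migration tool
GGMF_MAGIC = 0x67676d66  # old format that migrate-ggml-2023-03-30-pr613.py accepts

def needs_migration(path):
    # read the leading int32 magic exactly like read_hparams() does
    with open(path, "rb") as f:
        (magic,) = struct.unpack("i", f.read(4))
    if magic == GGJT_MAGIC:
        return False  # already 'ggjt', nothing to do
    if magic == GGMF_MAGIC:
        return True   # 'ggmf', run the migration tool on it
    raise ValueError("unrecognized magic %#x; regenerate with convert-pth-to-ggml.py" % magic)

print(needs_migration("models/7B/ggml-model-f16.bin"))  # placeholder path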