mirror of
https://git.adityakumar.xyz/llama.cpp.git
synced 2024-11-09 23:29:44 +00:00
Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the migrate-ggml-2023-03-30-pr613.py script will allow you to convert your old ggml files into the new mmap()'able format. See #613
This commit is contained in:
parent
6f23ba5ee2
commit
ee0c40dd6d
3 changed files with 326 additions and 14 deletions
|
@ -1,4 +1,4 @@
|
||||||
# Convert a LLaMA model checkpoint to a ggml compatible file
|
# Convert a LLaMA model checkpoint to a ggjt compatible file
|
||||||
#
|
#
|
||||||
# Load the model using Torch
|
# Load the model using Torch
|
||||||
# Iterate over all variables and write them to a binary file.
|
# Iterate over all variables and write them to a binary file.
|
||||||
|
@ -52,8 +52,8 @@ GGML_BLCK_SIZE = {
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_TYPE_SIZE = {
|
GGML_TYPE_SIZE = {
|
||||||
GGML_TYPE_Q4_0: 4 + QK/2,
|
GGML_TYPE_Q4_0: 4 + QK//2,
|
||||||
GGML_TYPE_Q4_1: 4*2 + QK/2,
|
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
||||||
GGML_TYPE_I8: 1,
|
GGML_TYPE_I8: 1,
|
||||||
GGML_TYPE_I16: 2,
|
GGML_TYPE_I16: 2,
|
||||||
GGML_TYPE_I32: 4,
|
GGML_TYPE_I32: 4,
|
||||||
|
@ -245,11 +245,9 @@ def main():
|
||||||
fname_model = f"{dir_model}/consolidated.00.pth"
|
fname_model = f"{dir_model}/consolidated.00.pth"
|
||||||
fname_out = f"{dir_model}/ggml-vocab.bin"
|
fname_out = f"{dir_model}/ggml-vocab.bin"
|
||||||
print(f"Extracting only the vocab from '{fname_model}'\n")
|
print(f"Extracting only the vocab from '{fname_model}'\n")
|
||||||
model = torch.load(fname_model, map_location="cpu")
|
|
||||||
with open(fname_out, "wb") as fout:
|
with open(fname_out, "wb") as fout:
|
||||||
write_header(fout, hparams, ftype)
|
write_header(fout, hparams, ftype)
|
||||||
write_tokens(fout, tokenizer)
|
write_tokens(fout, tokenizer)
|
||||||
del model
|
|
||||||
print(f"Done. Output file: {fname_out}\n")
|
print(f"Done. Output file: {fname_out}\n")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
19
llama.cpp
19
llama.cpp
|
@ -347,14 +347,15 @@ static void munmap_file(void * addr, size_t length) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool report_bad_magic(const char *path) {
|
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"%s: invalid model file (bad magic)\n"
|
"%s: invalid model file (bad magic [got %#x want %#x])\n"
|
||||||
"you most likely need to regenerate your ggml files\n"
|
"\tyou most likely need to regenerate your ggml files\n"
|
||||||
"the benefit is you'll get 10-100x faster load times\n"
|
"\tthe benefit is you'll get 10-100x faster load times\n"
|
||||||
"see https://github.com/ggerganov/llama.cpp/issues/91\n"
|
"\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
|
||||||
"use convert-pth-to-ggml.py on your llama model files\n",
|
"\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
|
||||||
path);
|
"\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
|
||||||
|
path, got, want);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -397,7 +398,7 @@ static bool llama_model_load(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (magic != LLAMA_FILE_MAGIC) {
|
if (magic != LLAMA_FILE_MAGIC) {
|
||||||
return report_bad_magic(fname.c_str());
|
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t format_version;
|
uint32_t format_version;
|
||||||
|
@ -1312,7 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (magic != LLAMA_FILE_MAGIC) {
|
if (magic != LLAMA_FILE_MAGIC) {
|
||||||
return report_bad_magic(fname_inp.c_str());
|
return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
|
||||||
}
|
}
|
||||||
|
|
||||||
fout.write((char *) &magic, sizeof(magic));
|
fout.write((char *) &magic, sizeof(magic));
|
||||||
|
|
313
migrate-ggml-2023-03-30-pr613.py
Normal file
313
migrate-ggml-2023-03-30-pr613.py
Normal file
|
@ -0,0 +1,313 @@
|
||||||
|
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
|
||||||
|
#
|
||||||
|
# We caused a breaking change to the file format on 2023-03-30 in:
|
||||||
|
# https://github.com/ggerganov/llama.cpp/pull/613
|
||||||
|
#
|
||||||
|
# (1) If you still have the Meta LLaMA .pth files, then close this
|
||||||
|
# file now; you can just run `convert-pth-to-ggml.py` again to
|
||||||
|
# migrate to the new format. The tool is easier to use too. It
|
||||||
|
# isn't necessary anymore to manage split output files because
|
||||||
|
# the new format always combines things into a single file.
|
||||||
|
#
|
||||||
|
# (2) If you deleted the Meta LLaMA .pth files to save on disk
|
||||||
|
# space, then this tool is intended to help you. Please check
|
||||||
|
# out the instructions below.
|
||||||
|
#
|
||||||
|
# USAGE
|
||||||
|
#
|
||||||
|
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
|
||||||
|
#
|
||||||
|
# PREREQUISITES
|
||||||
|
#
|
||||||
|
# pip install numpy
|
||||||
|
# cd llama.cpp
|
||||||
|
# make -j4
|
||||||
|
#
|
||||||
|
# EXAMPLE (7B MODEL)
|
||||||
|
#
|
||||||
|
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
||||||
|
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
|
||||||
|
#
|
||||||
|
# # check that it works
|
||||||
|
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
||||||
|
#
|
||||||
|
# # you can delete the old files
|
||||||
|
# rm -f models/7B/ggml-model-f16.bin
|
||||||
|
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
|
||||||
|
#
|
||||||
|
# EXAMPLE (13B MODEL)
|
||||||
|
#
|
||||||
|
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
||||||
|
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
|
||||||
|
#
|
||||||
|
# # check that it works
|
||||||
|
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
||||||
|
#
|
||||||
|
# # you can delete the old files
|
||||||
|
# rm -f models/13B/ggml-model-f16.bin*
|
||||||
|
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
|
||||||
|
#
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import struct
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Number of elements per block for the quantized (Q4_*) tensor types.
QK = 32

# ggml tensor type ids, numbered consecutively from zero.
(
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
) = range(7)

# Human-readable name for each per-tensor ftype code stored in the file.
WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
}

# Per-tensor ftype code stored in the file -> ggml tensor type id.
WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

# Elements per block for each ggml type (only the Q4 types use blocks > 1).
GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

# Bytes occupied by one block of each ggml type.
GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK // 2,
    GGML_TYPE_Q4_1: 4 * 2 + QK // 2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

# Header fields in on-disk order; each one is read and written as an int32.
HPARAMS = [
    'magic',
    'version',
    'n_vocab',
    'n_embd',
    'n_mult',
    'n_head',
    'n_layer',
    'n_rot',
    'f16',
]
|
||||||
|
|
||||||
|
def read_hparams(fin):
    """Read the fixed-size header from `fin` and return it as a dict.

    One native int32 is consumed for every name in HPARAMS, in order, and
    the result maps each name to its value. `struct.error` is raised by the
    unpack call when fewer bytes than expected are available.
    """
    layout = struct.Struct("i" * len(HPARAMS))
    values = layout.unpack(fin.read(layout.size))
    return dict(zip(HPARAMS, values))
|
||||||
|
|
||||||
|
def write_hparams(fout, hparams):
    """Write the header fields of `hparams` to `fout`.

    Every name listed in HPARAMS is looked up in `hparams` (a missing key
    raises KeyError) and the values are packed, in that order, as native
    int32s -- the mirror image of read_hparams().
    """
    fields = [hparams[key] for key in HPARAMS]
    fout.write(struct.pack("i" * len(HPARAMS), *fields))
|
||||||
|
|
||||||
|
def read_tokens(fin, hparams):
    """Read the vocabulary that follows the header in `fin`.

    Exactly hparams['n_vocab'] entries are consumed. Each entry is an int32
    byte length, that many raw bytes of token text, and a float32 score.

    Returns a list of (word_bytes, score) tuples in file order.

    Raises struct.error if the file ends early or an entry declares a
    negative length.
    """
    tokens = []
    for index in range(hparams['n_vocab']):
        (length,) = struct.unpack("i", fin.read(4))
        # A negative size would make read() consume the rest of the file,
        # so reject it explicitly instead of failing later on the score.
        if length < 0:
            raise struct.error(
                f"token {index}: corrupt vocabulary (negative length {length})")
        word = fin.read(length)
        if len(word) != length:
            raise struct.error(
                f"token {index}: truncated vocabulary "
                f"(expected {length} bytes, got {len(word)})")
        (score,) = struct.unpack("f", fin.read(4))
        tokens.append((word, score))
    return tokens
|
||||||
|
|
||||||
|
def write_tokens(fout, tokens):
    """Serialize the vocabulary to `fout`.

    `tokens` is an iterable of (word_bytes, score) pairs. Each pair is
    emitted as an int32 byte length, the raw bytes, then a float32 score.
    """
    emit = fout.write
    for entry in tokens:
        text, weight = entry
        emit(struct.pack("i", len(text)))
        emit(text)
        emit(struct.pack("f", weight))
|
||||||
|
|
||||||
|
def ggml_nelements(shape):
    """Return the product of all extents in `shape` (1 for an empty shape)."""
    total = 1
    for extent in shape:
        total = total * extent
    return total
|
||||||
|
|
||||||
|
def ggml_nbytes(shape, ftype):
    """Return the number of data bytes of a tensor of `shape` and `ftype`.

    `ftype` is the per-tensor type code stored in the file; it is mapped to
    a ggml type through WTYPES. The size is the element count multiplied by
    the bytes per block and floor-divided by the elements per block.
    """
    count = ggml_nelements(shape)
    ggml_type = WTYPES[ftype]
    scaled = count * GGML_TYPE_SIZE[ggml_type]
    return scaled // GGML_BLCK_SIZE[ggml_type]
|
||||||
|
|
||||||
|
def _read_exact(fin, size, what):
    """Read exactly `size` bytes from `fin`, or raise ValueError naming `what`."""
    if size < 0:
        raise ValueError(f"corrupt input: negative size {size} for {what}")
    chunk = fin.read(size)
    if len(chunk) != size:
        raise ValueError(
            f"truncated input: expected {size} bytes of {what}, got {len(chunk)}")
    return chunk


def _split_dim(name):
    """Return the dimension along which a 2-D multipart tensor is sharded.

    split_dim 0 (rows):
      - output.*
      - layers.*.attention.wq.weight
      - layers.*.attention.wk.weight
      - layers.*.attention.wv.weight
      - layers.*.feed_forward.w1.weight
      - layers.*.feed_forward.w3.weight

    split_dim 1 (columns):
      - tok_embeddings.*
      - layers.*.attention.wo.weight
      - layers.*.feed_forward.w2.weight
      - any name matching none of the patterns above
    """
    if b"tok_embeddings" in name:
        return 1
    if b"layers" in name:
        if b"attention.wo.weight" in name or b"feed_forward.w2.weight" in name:
            return 1
        return 0
    if b"output" in name:
        return 0
    return 1


def copy_tensors(fin, fout, part_id, n_parts):
    """Copy every tensor of one input part into the unified output file.

    `fin` must be positioned at the first tensor record of part `part_id`
    (0-based) and `fout` at the start of the output's tensor section. For
    each record the full-size tensor header is (re)written, the data offset
    is padded up to a multiple of QK bytes, and this part's slice of the
    data is written at its place inside the full tensor. 1-D tensors, and
    all tensors when `n_parts` is 1, are written by part 0 only.

    Each record is: int32 n_dims, int32 name length, int32 ftype, n_dims
    int32 extents, the name bytes, then the tensor data.

    Raises ValueError if a record is truncated, has an unsupported number
    of dimensions or an unknown ftype.
    """
    while True:
        header = fin.read(12)
        if not header:
            break  # clean end of this part file
        if len(header) != 12:
            raise ValueError("truncated input: incomplete tensor header")
        n_dims, length, ftype = struct.unpack("iii", header)

        # Validate with real exceptions: asserts vanish under `python -O`.
        if n_dims not in (1, 2):
            raise ValueError(f"unsupported tensor rank {n_dims} (expected 1 or 2)")
        if ftype not in WTYPES:
            raise ValueError(f"unknown tensor ftype {ftype}")

        # Extents are stored in reverse order relative to `partshape`.
        dims = struct.unpack(
            "i" * n_dims, _read_exact(fin, 4 * n_dims, "tensor shape"))
        partshape = list(reversed(dims))

        name = _read_exact(fin, length, "tensor name")
        # Refuse short data: writing it would silently corrupt the output.
        data = _read_exact(fin, ggml_nbytes(partshape, ftype), "tensor data")

        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # output tensor header, describing the reassembled (full) tensor
        fullshape = list(partshape)
        if n_dims > 1:
            split_dim = _split_dim(name)
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned to a multiple of QK bytes
        tensor_data_offset = fout.tell()
        padding = -tensor_data_offset % QK
        if padding:
            fout.write(bytes(padding))
            tensor_data_offset += padding

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            fout.seek(tensor_data_offset + current_row * bytes_per_row)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset = row * bytes_per_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line: two positionals, `fin_path` then `fout_path`."""
    cli = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    positionals = (
        ('fin_path', 'your old ggml file (leave out the .1 .2 etc.)'),
        ('fout_path', 'your new ggjt file name'),
    )
    for dest, text in positionals:
        cli.add_argument(dest, help=text)
    return cli.parse_args()
|
||||||
|
|
||||||
|
def main():
    """Convert a 'ggmf' model (possibly split into parts) to one 'ggjt' file.

    The header and vocabulary are taken from the first input file. Further
    parts are discovered by probing for INPUT.1, INPUT.2, ... until a name
    does not exist. Every part's tensors are then merged into the output.

    Exits with status 1 if the paths are missing or refer to the same file,
    or if the input does not carry the expected 'ggmf' magic.
    """
    args = parse_args()

    # Explicit checks instead of asserts: under `python -O` asserts are
    # removed, and opening the output would then truncate the input.
    if not args.fin_path or not args.fout_path:
        sys.exit("error: both an input path and an output path are required")
    if os.path.realpath(args.fin_path) == os.path.realpath(args.fout_path):
        sys.exit("%s: input and output must be different files" % (args.fin_path))

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)

        # Check the magic before parsing the vocabulary, so a file in some
        # other format yields the messages below rather than a parse error.
        if hparams['magic'] == 0x67676a74:  # ggjt
            print("%s: input ggml has already been converted to 'ggjt' magic\n" %
                  (args.fin_path))
            sys.exit(1)

        if hparams['magic'] != 0x67676d66:  # ggmf
            print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
                  (args.fin_path, hparams['magic']))
            sys.exit(1)

        tokens = read_tokens(fin, hparams)

    hparams['magic'] = 0x67676a74  # ggjt

    # count number of multipart files by convention
    n_parts = 1
    while os.path.exists("%s.%d" % (args.fin_path, n_parts)):
        n_parts += 1

    # we output a single file for ggjt
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            # every part starts over at the beginning of the tensor section
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += ".%d" % (part_id)
            with open(fin_path, "rb") as fin:
                # skip this part's own header and vocabulary
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")


if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue