drop quantize.py (now that models are using a single file)
parent 3df890aef4
commit 9733104be5
2 changed files with 2 additions and 133 deletions
README.md
@@ -155,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1
 
-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
 
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
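With the wrapper gone, quantizing several models means running ./quantize once per model. A minimal sketch of an equivalent loop, assuming the new single-file layout and a quantize binary built in the repository root (the model list and paths are illustrative, not from this commit):

    #!/usr/bin/env python3
    # Sketch only: repeats the new single-file quantize command per model.
    import subprocess

    for model in ("7B", "13B"):  # illustrative model set
        f16_path = f"./models/{model}/ggml-model-f16.bin"
        q4_0_path = f"./models/{model}/ggml-model-q4_0.bin"
        # The trailing "2" selects quantization method q4_0, as in the
        # README command above.
        subprocess.run(["./quantize", f16_path, q4_0_path, "2"], check=True)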
quantize.py (131 lines deleted)
@@ -1,131 +0,0 @@
#!/usr/bin/env python3

"""Script to execute the "quantize" binary on a given set of models."""

import subprocess
import argparse
import glob
import sys
import os


def main():
    """Update the quantize binary name depending on the platform, parse
    the command line arguments, and execute the binary.
    """

    if "linux" in sys.platform or "darwin" in sys.platform:
        quantize_script_binary = "quantize"

    elif "win32" in sys.platform or "cygwin" in sys.platform:
        quantize_script_binary = "quantize.exe"

    else:
        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
        quantize_script_binary = "quantize"

    parser = argparse.ArgumentParser(
        prog='python3 quantize.py',
        description='This script quantizes the given models by applying the '
        f'"{quantize_script_binary}" binary on them.'
    )
    parser.add_argument(
        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
        help='The models to quantize.'
    )
    parser.add_argument(
        '-r', '--remove-16', action='store_true', dest='remove_f16',
        help='Remove the f16 model after quantizing it.'
    )
    parser.add_argument(
        '-m', '--models-path', dest='models_path',
        default=os.path.join(os.getcwd(), "models"),
        help='Specify the directory where the models are located.'
    )
    parser.add_argument(
        '-q', '--quantize-script-path', dest='quantize_script_path',
        default=os.path.join(os.getcwd(), quantize_script_binary),
        help='Specify the path to the "quantize" binary.'
    )

    # TODO: Revise this code
    # parser.add_argument(
    #     '-t', '--threads', dest='threads', type='int',
    #     default=os.cpu_count(),
    #     help='Specify the number of threads to use to quantize many models at '
    #     'once. Defaults to os.cpu_count().'
    # )

    args = parser.parse_args()
    args.models_path = os.path.abspath(args.models_path)

    if not os.path.isfile(args.quantize_script_path):
        print(
            f'The "{quantize_script_binary}" binary was not found in the '
            "current location.\nIf you want to use it from another location, "
            "set the --quantize-script-path argument from the command line."
        )
        sys.exit(1)

    for model in args.models:
        # The model may be split into several parts
        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
        f16_model_path_base = os.path.join(
            args.models_path, model, "ggml-model-f16.bin"
        )

        if not os.path.isfile(f16_model_path_base):
            print(f"The file {f16_model_path_base} was not found")
            sys.exit(1)

        # glob already returns the directory-qualified path of every part.
        f16_model_parts_paths = glob.glob(f"{f16_model_path_base}*")

        for f16_model_part_path in f16_model_parts_paths:
            if not os.path.isfile(f16_model_part_path):
                print(
                    f"The f16 model {os.path.basename(f16_model_part_path)} "
                    f"was not found in {args.models_path}{os.path.sep}{model}"
                    ". If you want to use it from another location, set the "
                    "--models-path argument from the command line."
                )
                sys.exit(1)

            __run_quantize_script(
                args.quantize_script_path, f16_model_part_path
            )

            if args.remove_f16:
                os.remove(f16_model_part_path)


# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
def __run_quantize_script(script_path, f16_model_part_path):
    """Run the quantize binary, given its path and the path of the
    f16 model to quantize.
    """

    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
    subprocess.run(
        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
        check=True
    )


if __name__ == "__main__":
    try:
        main()

    except subprocess.CalledProcessError:
        print("\nAn error occurred while trying to quantize the models.")
        sys.exit(1)

    except KeyboardInterrupt:
        sys.exit(0)

    else:
        print("\nSuccessfully quantized all models.")