drop quantize.py (now that models are using a single file)
parent 3df890aef4
commit 9733104be5
2 changed files with 2 additions and 133 deletions
README.md
@@ -155,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1
 
-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
 
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
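With the wrapper gone, quantizing several models means running ./quantize once per model. A minimal sketch of an equivalent loop, assuming the new single-file layout and a quantize binary built in the repository root (the model list and paths are illustrative, not from this commit):

    #!/usr/bin/env python3
    # Sketch only: repeats the new single-file quantize command per model.
    import subprocess

    for model in ("7B", "13B"):  # illustrative model set
        f16_path = f"./models/{model}/ggml-model-f16.bin"
        q4_0_path = f"./models/{model}/ggml-model-q4_0.bin"
        # The trailing "2" selects quantization method q4_0, as in the
        # README command above.
        subprocess.run(["./quantize", f16_path, q4_0_path, "2"], check=True)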
quantize.py (131 lines deleted)
@@ -1,131 +0,0 @@
#!/usr/bin/env python3

"""Script to execute the "quantize" binary on a given set of models."""

import subprocess
import argparse
import glob
import sys
import os


def main():
    """Update the quantize binary name depending on the platform, parse
    the command line arguments, and execute the binary.
    """

    if "linux" in sys.platform or "darwin" in sys.platform:
        quantize_script_binary = "quantize"

    elif "win32" in sys.platform or "cygwin" in sys.platform:
        quantize_script_binary = "quantize.exe"

    else:
        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
        quantize_script_binary = "quantize"

    parser = argparse.ArgumentParser(
        prog='python3 quantize.py',
        description='This script quantizes the given models by applying the '
        f'"{quantize_script_binary}" binary on them.'
    )
    parser.add_argument(
        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
        help='The models to quantize.'
    )
    parser.add_argument(
        '-r', '--remove-16', action='store_true', dest='remove_f16',
        help='Remove the f16 model after quantizing it.'
    )
    parser.add_argument(
        '-m', '--models-path', dest='models_path',
        default=os.path.join(os.getcwd(), "models"),
        help='Specify the directory where the models are located.'
    )
    parser.add_argument(
        '-q', '--quantize-script-path', dest='quantize_script_path',
        default=os.path.join(os.getcwd(), quantize_script_binary),
        help='Specify the path to the "quantize" binary.'
    )

    # TODO: Revise this code
    # parser.add_argument(
    #     '-t', '--threads', dest='threads', type='int',
    #     default=os.cpu_count(),
    #     help='Specify the number of threads to use to quantize many models at '
    #     'once. Defaults to os.cpu_count().'
    # )

    args = parser.parse_args()
    args.models_path = os.path.abspath(args.models_path)

    if not os.path.isfile(args.quantize_script_path):
        print(
            f'The "{quantize_script_binary}" binary was not found in the '
            "current location.\nIf you want to use it from another location, "
            "set the --quantize-script-path argument from the command line."
        )
        sys.exit(1)

    for model in args.models:
        # The model may be split into several parts
        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
        f16_model_path_base = os.path.join(
            args.models_path, model, "ggml-model-f16.bin"
        )

        if not os.path.isfile(f16_model_path_base):
            print(f"The file {f16_model_path_base} was not found")
            sys.exit(1)

        # glob already returns the directory-qualified path of every part.
        f16_model_parts_paths = glob.glob(f"{f16_model_path_base}*")

        for f16_model_part_path in f16_model_parts_paths:
            if not os.path.isfile(f16_model_part_path):
                print(
                    f"The f16 model {os.path.basename(f16_model_part_path)} "
                    f"was not found in {args.models_path}{os.path.sep}{model}"
                    ". If you want to use it from another location, set the "
                    "--models-path argument from the command line."
                )
                sys.exit(1)

            __run_quantize_script(
                args.quantize_script_path, f16_model_part_path
            )

            if args.remove_f16:
                os.remove(f16_model_part_path)


# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
def __run_quantize_script(script_path, f16_model_part_path):
    """Run the quantize binary, given its path and the path of the
    f16 model to quantize.
    """

    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
    subprocess.run(
        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
        check=True
    )


if __name__ == "__main__":
    try:
        main()

    except subprocess.CalledProcessError:
        print("\nAn error occurred while trying to quantize the models.")
        sys.exit(1)

    except KeyboardInterrupt:
        sys.exit(0)

    else:
        print("\nSuccessfully quantized all models.")