glm4/intel_device_demo/openvino/convert.py

"""
This script is used to convert the original model to OpenVINO IR format.
The Origin Code can check https://github.com/OpenVINO-dev-contest/chatglm3.openvino/blob/main/convert.py
"""
from transformers import AutoTokenizer, AutoConfig
from optimum.intel import OVWeightQuantizationConfig
from optimum.intel.openvino import OVModelForCausalLM

import os
from pathlib import Path
import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-h',
                        '--help',
                        action='help',
                        help='Show this help message and exit.')
    parser.add_argument('-m',
                        '--model_id',
                        default='THUDM/glm-4-9b-chat',
                        required=False,
                        type=str,
                        help='orignal model path')
    parser.add_argument('-p',
                        '--precision',
                        required=False,
                        default="int4",
                        type=str,
                        choices=["fp16", "int8", "int4"],
                        help='fp16, int8 or int4')
    parser.add_argument('-o',
                        '--output',
                        default='./glm-4-9b-ov',
                        required=False,
                        type=str,
                        help='Required. path to save the ir model')
    args = parser.parse_args()

    ir_model_path = Path(args.output)
    if ir_model_path.exists() == False:
        os.mkdir(ir_model_path)

    model_kwargs = {
        "trust_remote_code": True,
        "config": AutoConfig.from_pretrained(args.model_id, trust_remote_code=True),
    }
    compression_configs = {
        "sym": False,
        "group_size": 128,
        "ratio": 0.8,
    }

    print("====Exporting IR=====")
    if args.precision == "int4":
        ov_model = OVModelForCausalLM.from_pretrained(args.model_id, export=True,
                                                      compile=False, quantization_config=OVWeightQuantizationConfig(
                                                          bits=4, **compression_configs), **model_kwargs)
    elif args.precision == "int8":
        ov_model = OVModelForCausalLM.from_pretrained(args.model_id, export=True,
                                                      compile=False, load_in_8bit=True, **model_kwargs)
    else:
        ov_model = OVModelForCausalLM.from_pretrained(args.model_id, export=True,
                                                      compile=False, load_in_8bit=False, **model_kwargs)

    ov_model.save_pretrained(ir_model_path)

    print("====Exporting tokenizer=====")
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_id, trust_remote_code=True)
    tokenizer.save_pretrained(ir_model_path)