From a2e501f43e2f18e7c8b5e0200430e3ef061c254c Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Mon, 17 Jun 2024 00:10:02 +0800
Subject: [PATCH] Update finetune README with notes on 128K and 1M context; add
 ITREX demo

---
 finetune_demo/README.md                   |   8 +-
 finetune_demo/README_en.md                |  16 ++--
 intel_device_demo/itrex/README.md         |  95 ++++++++++++++++++++
 intel_device_demo/itrex/README_en.md      |  94 ++++++++++++++++++++
 intel_device_demo/itrex/itrex_cli_demo.py | 102 ++++++++++++++++++++++
 intel_device_demo/itrex/requirements.txt  |   4 +
 6 files changed, 310 insertions(+), 9 deletions(-)
 create mode 100644 intel_device_demo/itrex/README.md
 create mode 100644 intel_device_demo/itrex/README_en.md
 create mode 100644 intel_device_demo/itrex/itrex_cli_demo.py
 create mode 100644 intel_device_demo/itrex/requirements.txt

diff --git a/finetune_demo/README.md b/finetune_demo/README.md
index 022c2f3..0fe5fb4 100644
--- a/finetune_demo/README.md
+++ b/finetune_demo/README.md
@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.10.12 / 3.12.3 (如果您使用 Python 3.12.3 目前需要使用 git 源码安装 nltk)
++ Python: 3.10.12 / 3.12.3 (如果您使用 Python 3.12.3,目前需要使用 git 源码安装 nltk)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8
@@ -24,7 +24,8 @@ Read this in [English](README_en.md)
 在开始微调之前,请你先安装`basic_demo`中的依赖,同时您需要安装本目录下的依赖项:
 
-> NOTE: NLTK 3.8.1 部分代码可能尚未对 python 3.12 进行适配,该情况下的适配方法可参考[issues #38](https://github.com/THUDM/GLM-4/issues/38)
+> NOTE: NLTK 3.8.1 部分代码可能尚未对 python 3.12
+> 进行适配,该情况下的适配方法可参考[issues #38](https://github.com/THUDM/GLM-4/issues/38)
 
 ```bash
 pip install -r requirements.txt
 ```
@@ -52,7 +53,7 @@
             "": ""
           }
         }
-        // Add more tools if needed
+        // Add more tools if needed
       ]
     },
     {
@@ -227,6 +228,7 @@ def load_model_and_tokenizer(
 2. 读取微调的模型,请注意,你应该使用微调模型的位置,例如,若你的模型位置为`/path/to/finetune_adapter_model`
    ,原始模型地址为`path/to/base_model`,则你应该使用`/path/to/finetune_adapter_model`作为`model_dir`。
 3. 完成上述操作后,就能正常使用微调的模型了,其他的调用方式没有变化。
+4. 本微调脚本没有测试过 128K、1M 等长文本的微调,长文本的微调需要更大显存的 GPU 设备,并且需要更高效的微调方案,需要开发者自行解决。
 
 ## 参考文献
 
diff --git a/finetune_demo/README_en.md b/finetune_demo/README_en.md
index ee54bb3..206953f 100644
--- a/finetune_demo/README_en.md
+++ b/finetune_demo/README_en.md
@@ -12,7 +12,8 @@ Test hardware information:
 
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: Python: 3.10.12 / 3.12.3 (Currently, you need to install nltk from the git source code if you use Python 3.12.3)
++ Python: 3.10.12 / 3.12.3 (Currently, you need to install nltk from the git source code if you use Python
+  3.12.3)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8
@@ -26,7 +27,8 @@ Test hardware information:
 Before starting fine-tuning, please install the dependencies in `basic_demo` first. You also need to install the
 dependencies in this directory:
 
-> NOTE: Some codes in NLTK 3.8.1 might not yet be compatible with Python 3.12. For adaptation methods in such cases, please refer to [issues #38](https://github.com/THUDM/GLM-4/issues/38).
+> NOTE: Some code in NLTK 3.8.1 might not yet be compatible with Python 3.12. For adaptation methods in such cases,
+> please refer to [issues #38](https://github.com/THUDM/GLM-4/issues/38).
 
 ```bash
 pip install -r requirements.txt
 ```
@@ -171,7 +173,7 @@ OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune_h
 
 Execute **single machine single card** run through the following code.
 ```shell
-python finetune_hf.py data/AdvertiseGen/ THUDM/glm-4-9b-chat configs/lora.yaml
+python finetune.py data/AdvertiseGen/ THUDM/glm-4-9b-chat configs/lora.yaml
 ```
 
 ## Fine-tune from a saved point
@@ -186,7 +188,7 @@ half-trained model, you can add a fourth parameter, which can be passed in two w
 
 For example, this is an example code to continue fine-tuning from the last saved point
 
 ```shell
-python finetune_hf.py data/AdvertiseGen/ THUDM/glm-4-9b-chat configs/lora.yaml yes
+python finetune.py data/AdvertiseGen/ THUDM/glm-4-9b-chat configs/lora.yaml yes
 ```
 
 ## Use the fine-tuned model
@@ -217,9 +219,9 @@ to the following tutorial.
 def load_model_and_tokenizer(
         model_dir: Union[str, Path], trust_remote_code: bool = True
 ) -> tuple[ModelType, TokenizerType]:
-
-
     model_dir = _resolve_path(model_dir)
+
+
     if (model_dir / 'adapter_config.json').exists():
         model = AutoPeftModelForCausalLM.from_pretrained(
             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
@@ -242,6 +244,8 @@ return model, tokenizer
    as `model_dir`.
 3. After completing the above operations, you can use the fine-tuned model normally. Other calling methods remain
    unchanged.
+4. This fine-tuning script has not been tested on long contexts such as 128K or 1M tokens. Fine-tuning long texts requires
+   GPU devices with more memory and a more efficient fine-tuning scheme, which developers need to work out on their own.
 
 ## Reference
 
diff --git a/intel_device_demo/itrex/README.md b/intel_device_demo/itrex/README.md
new file mode 100644
index 0000000..a402214
--- /dev/null
+++ b/intel_device_demo/itrex/README.md
@@ -0,0 +1,95 @@
+# 使用 Intel® Extension for Transformers 推理 GLM-4-9B-Chat 模型
+
+本示例介绍如何使用 Intel® Extension for Transformers 推理 GLM-4-9B-Chat 模型。
+
+## 设备和依赖检查
+
+### 相关推理测试数据
+
+**本文档的数据均在以下硬件环境测试,实际运行环境需求和运行占用的显存略有不同,请以实际运行环境为准。**
+
+测试硬件信息:
+
++ OS: Ubuntu 22.04 (本教程必须在 Linux 环境下执行)
++ Memory: 512GB
++ Python: 3.10.12
++ CPU: Intel(R) Xeon(R) Platinum 8358 CPU / 12th Gen Intel i5-12400
+
+## 安装依赖
+
+在开始推理之前,请你先安装`basic_demo`中的依赖,同时您需要安装本目录下的依赖项:
+```shell
+pip install -r requirements.txt
+```
+
+## 运行模型推理
+
+```shell
+python itrex_cli_demo.py
+```
+
+如果您是第一次推理,会有一次模型权重转换的过程,转换后的模型权重存放在`runtime_outs`文件夹下,这大概会消耗`60G`的硬盘空间。
+转换完成后,文件夹下有两个文件:
++ ne_chatglm2_f32.bin 52G(如果您不使用 FP32 进行推理,可以删掉这个文件)
++ ne_chatglm2_q_nf4_bestla_cfp32_sym_sfp32_g32.bin 8.1G
+
+如果您不是第一次推理,则会跳过这个步骤,直接开始对话,推理效果如下:
+```shell
+Welcome to the CLI chat. Type your messages below.
+
+User: 你好
+AVX:1 AVX2:1 AVX512F:1 AVX512BW:1 AVX_VNNI:0 AVX512_VNNI:1 AMX_INT8:0 AMX_BF16:0 AVX512_BF16:0 AVX512_FP16:0
+beam_size: 1, do_sample: 1, top_k: 40, top_p: 0.900, continuous_batching: 0, max_request_num: 1, early_stopping: 0, scratch_size_ratio: 1.000
+model_file_loader: loading model from runtime_outs/ne_chatglm2_q_nf4_bestla_cfp32_sym_sfp32_g32.bin
+Loading the bin file with NE format...
+load_ne_hparams 0.hparams.n_vocab = 151552
+load_ne_hparams 1.hparams.n_embd = 4096
+load_ne_hparams 2.hparams.n_mult = 0
+load_ne_hparams 3.hparams.n_head = 32
+load_ne_hparams 4.hparams.n_head_kv = 0
+load_ne_hparams 5.hparams.n_layer = 40
+load_ne_hparams 6.hparams.n_rot = 0
+load_ne_hparams 7.hparams.ftype = 0
+load_ne_hparams 8.hparams.max_seq_len = 131072
+load_ne_hparams 9.hparams.alibi_bias_max = 0.000
+load_ne_hparams 10.hparams.clip_qkv = 0.000
+load_ne_hparams 11.hparams.par_res = 0
+load_ne_hparams 12.hparams.word_embed_proj_dim = 0
+load_ne_hparams 13.hparams.do_layer_norm_before = 0
+load_ne_hparams 14.hparams.multi_query_group_num = 2
+load_ne_hparams 15.hparams.ffn_hidden_size = 13696
+load_ne_hparams 16.hparams.inner_hidden_size = 0
+load_ne_hparams 17.hparams.n_experts = 0
+load_ne_hparams 18.hparams.n_experts_used = 0
+load_ne_hparams 19.hparams.n_embd_head_k = 0
+load_ne_hparams 20.hparams.norm_eps = 0.000000
+load_ne_hparams 21.hparams.freq_base = 5000000.000
+load_ne_hparams 22.hparams.freq_scale = 1.000
+load_ne_hparams 23.hparams.rope_scaling_factor = 0.000
+load_ne_hparams 24.hparams.original_max_position_embeddings = 0
+load_ne_hparams 25.hparams.use_yarn = 0
+load_ne_vocab 26.vocab.bos_token_id = 1
+load_ne_vocab 27.vocab.eos_token_id = 151329
+load_ne_vocab 28.vocab.pad_token_id = 151329
+load_ne_vocab 29.vocab.sep_token_id = -1
+init: hparams.n_vocab = 151552
+init: hparams.n_embd = 4096
+init: hparams.n_mult = 0
+init: hparams.n_head = 32
+init: hparams.n_layer = 40
+init: hparams.n_rot = 0
+init: hparams.ffn_hidden_size = 13696
+init: n_parts = 1
+load: ctx size = 16528.38 MB
+load: layers[0].ffn_fusion = 1
+load: scratch0 = 4096.00 MB
+load: scratch1 = 2048.00 MB
+load: scratch2 = 4096.00 MB
+load: mem required = 26768.38 MB (+ memory per state)
+.............................................................................................
+model_init_from_file: support_bestla_kv = 1
+kv_cache_init: run_mha_reordered = 1
+model_init_from_file: kv self size = 690.00 MB
+Assistant:
+你好👋!我是人工智能助手,很高兴为你服务。有什么可以帮助你的吗?
+```
diff --git a/intel_device_demo/itrex/README_en.md b/intel_device_demo/itrex/README_en.md
new file mode 100644
index 0000000..c945445
--- /dev/null
+++ b/intel_device_demo/itrex/README_en.md
@@ -0,0 +1,94 @@
+
+# Using Intel® Extension for Transformers to Run Inference on the GLM-4-9B-Chat Model
+
+This example shows how to use Intel® Extension for Transformers to run inference on the GLM-4-9B-Chat model.
+
+## Device and Dependency Check
+
+### Relevant Inference Test Data
+
+**The data in this document was measured on the following hardware environment. Actual requirements and memory usage may vary slightly; please take your actual running environment as the reference.**
+
+Test hardware information:
+
++ OS: Ubuntu 22.04 (This tutorial must be executed in a Linux environment)
++ Memory: 512GB
++ Python: 3.10.12
++ CPU: Intel(R) Xeon(R) Platinum 8358 CPU / 12th Gen Intel i5-12400
+
+## Installing Dependencies
+
+Before starting inference, please install the dependencies in `basic_demo` first, as well as the dependencies in this directory:
+```shell
+pip install -r requirements.txt
+```
+
+## Running Model Inference
+
+```shell
+python itrex_cli_demo.py
+```
+
+If this is your first inference, the model weights are converted once. The converted weights are stored in the `runtime_outs` folder and consume about `60G` of disk space.
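+
+The demo loads `THUDM/glm-4-9b-chat` by default. If you have already downloaded the weights locally, the `MODEL_PATH`
+environment variable read by `itrex_cli_demo.py` can point the demo at that copy instead; the local path below is only
+an illustration:
+
+```shell
+MODEL_PATH=/path/to/glm-4-9b-chat python itrex_cli_demo.py
+```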
+After the conversion is completed, there are two files in the folder:
++ ne_chatglm2_f32.bin 52G (If you do not use FP32 for inference, you can delete this file)
++ ne_chatglm2_q_nf4_bestla_cfp32_sym_sfp32_g32.bin 8.1G
+
+If this is not your first inference, this step is skipped and the conversation starts directly. The inference output looks like this:
+```shell
+Welcome to the CLI chat. Type your messages below.
+
+User: Hello
+AVX:1 AVX2:1 AVX512F:1 AVX512BW:1 AVX_VNNI:0 AVX512_VNNI:1 AMX_INT8:0 AMX_BF16:0 AVX512_BF16:0 AVX512_FP16:0
+beam_size: 1, do_sample: 1, top_k: 40, top_p: 0.900, continuous_batching: 0, max_request_num: 1, early_stopping: 0, scratch_size_ratio: 1.000
+model_file_loader: loading model from runtime_outs/ne_chatglm2_q_nf4_bestla_cfp32_sym_sfp32_g32.bin
+Loading the bin file with NE format...
+load_ne_hparams 0.hparams.n_vocab = 151552
+load_ne_hparams 1.hparams.n_embd = 4096
+load_ne_hparams 2.hparams.n_mult = 0
+load_ne_hparams 3.hparams.n_head = 32
+load_ne_hparams 4.hparams.n_head_kv = 0
+load_ne_hparams 5.hparams.n_layer = 40
+load_ne_hparams 6.hparams.n_rot = 0
+load_ne_hparams 7.hparams.ftype = 0
+load_ne_hparams 8.hparams.max_seq_len = 131072
+load_ne_hparams 9.hparams.alibi_bias_max = 0.000
+load_ne_hparams 10.hparams.clip_qkv = 0.000
+load_ne_hparams 11.hparams.multi_query_group_num = 2
+load_ne_hparams 12.hparams.ffn_hidden_size = 13696
+load_ne_hparams 13.hparams.inner_hidden_size = 0
+load_ne_hparams 14.hparams.n_experts = 0
+load_ne_hparams 15.hparams.n_experts_used = 0
+load_ne_hparams 16.hparams.n_embd_head_k = 0
+load_ne_hparams 17.hparams.norm_eps = 0.000000
+load_ne_hparams 18.hparams.freq_base = 5000000.000
+load_ne_hparams 19.hparams.freq_scale = 1.000
+load_ne_hparams 20.hparams.rope_scaling_factor = 0.000
+load_ne_hparams 21.hparams.original_max_position_embeddings = 0
+load_ne_hparams 22.hparams.use_yarn = 0
+load_ne_vocab 23.vocab.bos_token_id = 1
+load_ne_vocab 24.vocab.eos_token_id = 151329
+load_ne_vocab 25.vocab.pad_token_id = 151329
+load_ne_vocab 26.vocab.sep_token_id = -1
+init: hparams.n_vocab = 151552
+init: hparams.n_embd = 4096
+init: hparams.n_mult = 0
+init: hparams.n_head = 32
+init: hparams.n_layer = 40
+init: hparams.n_rot = 0
+init: hparams.ffn_hidden_size = 13696
+init: n_parts = 1
+load: ctx size = 16528.38 MB
+load: layers[0].ffn_fusion = 1
+load: scratch0 = 4096.00 MB
+load: scratch1 = 2048.00 MB
+load: scratch2 = 4096.00 MB
+load: mem required = 26768.38 MB (+ memory per state)
+.............................................................................................
+model_init_from_file: support_bestla_kv = 1
+kv_cache_init: run_mha_reordered = 1
+model_init_from_file: kv self size = 690.00 MB
+Assistant:
+Hello👋! I am an AI assistant. How can I help you today?
+```
+
diff --git a/intel_device_demo/itrex/itrex_cli_demo.py b/intel_device_demo/itrex/itrex_cli_demo.py
new file mode 100644
index 0000000..feb53b4
--- /dev/null
+++ b/intel_device_demo/itrex/itrex_cli_demo.py
@@ -0,0 +1,102 @@
+"""
+This script creates a CLI demo for the glm-4-9b model with a transformers backend, using
+Intel® Extension for Transformers.
+"""
+
+import os
+from threading import Thread
+
+import torch
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
+from transformers import TextIteratorStreamer, StoppingCriteriaList, StoppingCriteria, AutoTokenizer
+
+MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
+
+
+class StopOnTokens(StoppingCriteria):
+    """Stop generation as soon as the last generated token is one of GLM-4's stop tokens."""
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        stop_ids = [151329, 151336, 151338]  # special token ids used by glm-4-9b-chat to end a turn
+        for stop_id in stop_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+
+
+def initialize_model_and_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        device_map="cpu",  # Use the Intel CPU for inference
+        trust_remote_code=True,
+        load_in_4bit=True  # 4-bit quantization; the first run converts and caches the weights (see README)
+    )
+    return tokenizer, model
+
+
+def get_user_input():
+    return input("\nUser: ")
+
+
+def main():
+    tokenizer, model = initialize_model_and_tokenizer()
+
+    history = []
+    max_new_tokens = 100
+    top_p = 0.9
+    temperature = 0.8
+    stop = StopOnTokens()
+
+    print("Welcome to the CLI chat. Type your messages below.")
+    while True:
+        user_input = get_user_input()
+        if user_input.lower() in ["exit", "quit"]:
+            break
+        history.append([user_input, ""])
+
+        # Rebuild the message list from the chat history; the last user turn has no reply yet.
+        messages = []
+        for idx, (user_msg, model_msg) in enumerate(history):
+            if idx == len(history) - 1 and not model_msg:
+                messages.append({"role": "user", "content": user_msg})
+                break
+            if user_msg:
+                messages.append({"role": "user", "content": user_msg})
+            if model_msg:
+                messages.append({"role": "assistant", "content": model_msg})
+
+        model_inputs = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_tensors="pt"
+        )
+
+        # Stream tokens back as they are generated instead of waiting for the full reply.
+        streamer = TextIteratorStreamer(
+            tokenizer=tokenizer,
+            timeout=60,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
+        generate_kwargs = {
+            "input_ids": model_inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "top_p": top_p,
+            "temperature": temperature,
+            "stopping_criteria": StoppingCriteriaList([stop]),
+            "repetition_penalty": 1.2,
+            "eos_token_id": model.config.eos_token_id,
+        }
+
+        # Run generation in a background thread so the main thread can consume the stream.
+        t = Thread(target=model.generate, kwargs=generate_kwargs)
+        t.start()
+        print("Assistant:", end="", flush=True)
+        for new_token in streamer:
+            if new_token:
+                print(new_token, end="", flush=True)
+                history[-1][1] += new_token
+
+        history[-1][1] = history[-1][1].strip()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/intel_device_demo/itrex/requirements.txt b/intel_device_demo/itrex/requirements.txt
new file mode 100644
index 0000000..aca32e1
--- /dev/null
+++ b/intel_device_demo/itrex/requirements.txt
@@ -0,0 +1,4 @@
+cmake>=3.29.5.1
+huggingface-hub>=0.23.4
+git+https://github.com/intel/neural-speed.git@main#egg=neural-speed
+intel-extension-for-transformers>=1.4.2