From d8828b19fd846774b6c055b97a652f8ebc62fa2d Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Fri, 28 Jun 2024 16:09:51 +0800
Subject: [PATCH] intel openvino support

---
 README.md                                   |   5 +-
 README_en.md                                |   4 +-
 intel_device_demo/openvino/README.md        |  70 ++++++++++
 intel_device_demo/openvino/README_en.md     |  70 ++++++++++
 intel_device_demo/openvino/convert.py       |  72 +++++++++++
 .../openvino/openvino_cli_demo.py           | 122 ++++++++++++++++++
 intel_device_demo/openvino/requirements.txt |   2 +
 7 files changed, 342 insertions(+), 3 deletions(-)
 create mode 100644 intel_device_demo/openvino/README.md
 create mode 100644 intel_device_demo/openvino/README_en.md
 create mode 100644 intel_device_demo/openvino/convert.py
 create mode 100644 intel_device_demo/openvino/openvino_cli_demo.py
 create mode 100644 intel_device_demo/openvino/requirements.txt

diff --git a/README.md b/README.md
index 29e831d..960ec75 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,10 @@ Read this in [English](README_en.md)

## Project Updates

-- ๐Ÿ”ฅ๐Ÿ”ฅ **News**: ``2024/6/24``: We updated the run and configuration files in the model repository to support Flash Attention 2;
+- ๐Ÿ”ฅ **News**: ``2024/6/28``: We have worked with the Intel technical team to improve the ITREX and OpenVINO deployment
+tutorials for GLM-4-9B-Chat. You can use Intel CPU/GPU devices to efficiently deploy the GLM-4-9B open source model.
+Welcome to [view](intel_device_demo).
+- ๐Ÿ”ฅ **News**: ``2024/6/24``: We updated the run and configuration files in the model repository to support Flash Attention 2;
please update the model configuration file and refer to the sample code in `basic_demo/trans_cli_demo.py`.
- ๐Ÿ”ฅ **News**: ``2024/6/19``: We updated the run and configuration files in the model repository and fixed some known model inference issues. Welcome to clone the latest model repository.
- ๐Ÿ”ฅ **News**: ``2024/6/18``: We released a [technical report](https://arxiv.org/pdf/2406.12793); welcome to check it out.
diff --git a/README_en.md b/README_en.md
index ba6efa1..1d867fb 100644
--- a/README_en.md
+++ b/README_en.md
@@ -5,11 +5,11 @@

๐Ÿ“Experience and use a larger-scale GLM business model on the Zhipu AI Open Platform -

## Update
-- ๐Ÿ”ฅ๐Ÿ”ฅ **News**: ``2024/6/24``: We have updated the running files and configuration files of the model repository to support Flash Attention 2,
+- ๐Ÿ”ฅ **News**: ``2024/6/28``: We have worked with the Intel technical team to improve the ITREX and OpenVINO deployment
+tutorials for GLM-4-9B-Chat. You can use Intel CPU/GPU devices to efficiently deploy the GLM-4-9B open source model.
+Welcome to [view](intel_device_demo).
+- ๐Ÿ”ฅ **News**: ``2024/6/24``: We have updated the running files and configuration files of the model repository to support Flash Attention 2,
Please update the model configuration file and refer to the sample code in `basic_demo/trans_cli_demo.py`.
- ๐Ÿ”ฅ๐Ÿ”ฅ **News**: ``2024/6/19``: We updated the running files and configuration files of the model repository and fixed some model inference issues. Welcome to clone the latest model repository.
- ๐Ÿ”ฅ **News**: ``2024/6/18``: We released a [technical report](https://arxiv.org/pdf/2406.12793), welcome to check it out.
diff --git a/intel_device_demo/openvino/README.md b/intel_device_demo/openvino/README.md
new file mode 100644
index 0000000..8814c9b
--- /dev/null
+++ b/intel_device_demo/openvino/README.md
@@ -0,0 +1,70 @@
+# Deploying the GLM-4-9B-Chat Model with OpenVINO
+
+Read this in [English](README_en.md).
+
+[OpenVINO](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html)
+is an open-source toolkit designed by Intel for deep learning inference. It helps developers optimize models, improve inference performance, and reduce model memory usage.
+This example shows how to deploy the GLM-4-9B-Chat model with OpenVINO.
+
+## 1. Environment setup
+
+First, install the dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## 2. Convert the model
+
+The Hugging Face model needs to be converted to an OpenVINO IR model, so you must download the model and convert it:
+
+```
+python3 convert.py --model_id THUDM/glm-4-9b-chat --output {your_path}/glm-4-9b-chat-ov
+```
+
+### Optional parameters
+
+* `--model_id` - Path to the directory containing the model (absolute path), or a Hugging Face model ID.
+* `--output` - Path where the converted model is saved.
+* `--precision` - Conversion precision: `fp16`, `int8`, or `int4` (default).
+
+The conversion process looks like this:
+```
+====Exporting IR=====
+Framework not specified. Using pt to export the model.
+Loading checkpoint shards: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:04<00:00,  2.14it/s]
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Using framework PyTorch: 2.3.1+cu121
+Mixed-Precision assignment โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 160/160 โ€ข 0:01:45 โ€ข 0:00:00
+INFO:nncf:Statistics of the bitwidth distribution:
+โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฏโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฏโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”‘
+โ”‚   Num bits (N) โ”‚ % all parameters (layers)   โ”‚ % ratio-defining parameters (layers)   โ”‚
+โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฟโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฟโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฅ
+โ”‚              8 โ”‚ 31% (76 / 163)              โ”‚ 20% (73 / 160)                         โ”‚
+โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
+โ”‚              4 โ”‚ 69% (87 / 163)              โ”‚ 80% (87 / 160)                         โ”‚
+โ”•โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ทโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ทโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”™
+Applying Weight Compression โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 163/163 โ€ข 0:03:46 โ€ข 0:00:00
+Configuration saved in glm-4-9b-ov/openvino_config.json
+====Exporting tokenizer=====
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+```
่ฟ่กŒ GLM-4-9B-Chat ๆจกๅž‹ + +``` +python3 chat.py --model_path {your_path}/glm-4-9b-chat-ov --max_sequence_length 4096 --device CPU +``` + +### ๅฏไปฅ้€‰ๆ‹ฉ็š„ๅ‚ๆ•ฐ + +* `--model_path` - OpenVINO IR ๆจกๅž‹ๆ‰€ๅœจ็›ฎๅฝ•็š„่ทฏๅพ„ใ€‚ +* `--max_sequence_length` - ่พ“ๅ‡บๆ ‡่ฎฐ็š„ๆœ€ๅคงๅคงๅฐใ€‚ +* `--device` - ่ฟ่กŒๆŽจ็†็š„่ฎพๅค‡ใ€‚ + +### ๅ‚่€ƒไปฃ็  + +ๆœฌไปฃ็ ๅ‚่€ƒ [OpenVINO ๅฎ˜ๆ–น็คบไพ‹](https://github.com/OpenVINO-dev-contest/chatglm3.openvino) ่ฟ›่กŒไฟฎๆ”นใ€‚ \ No newline at end of file diff --git a/intel_device_demo/openvino/README_en.md b/intel_device_demo/openvino/README_en.md new file mode 100644 index 0000000..d6b20fc --- /dev/null +++ b/intel_device_demo/openvino/README_en.md @@ -0,0 +1,70 @@ +# Deploy the GLM-4-9B-Chat model using OpenVINO + +[OpenVINO](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) +is an open source toolkit designed by Intel for deep learning inference. It can help developers optimize models, improve inference performance, and reduce model memory usage. +This example will show how to deploy the GLM-4-9B-Chat model using OpenVINO. + +## 1. Environment configuration + +First, you need to install the dependencies + +```bash +pip install -r requirements.txt +``` + +## 2. Convert the model + +Since the Huggingface model needs to be converted to an OpenVINO IR model, you need to download the model and convert it. + +``` +python3 convert.py --model_id THUDM/glm-4-9b-chat --output {your_path}/glm-4-9b-chat-ov +``` +The conversion process is as follows: +``` +====Exporting IR===== +Framework not specified. Using pt to export the model. +Loading checkpoint shards: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:04<00:00, 2.14it/s] +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
diff --git a/intel_device_demo/openvino/README_en.md b/intel_device_demo/openvino/README_en.md
new file mode 100644
index 0000000..d6b20fc
--- /dev/null
+++ b/intel_device_demo/openvino/README_en.md
@@ -0,0 +1,70 @@
+# Deploy the GLM-4-9B-Chat model using OpenVINO
+
+[OpenVINO](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html)
+is an open source toolkit designed by Intel for deep learning inference. It can help developers optimize models, improve inference performance, and reduce model memory usage.
+This example will show how to deploy the GLM-4-9B-Chat model using OpenVINO.
+
+## 1. Environment configuration
+
+First, you need to install the dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## 2. Convert the model
+
+Since the Hugging Face model needs to be converted to an OpenVINO IR model, you need to download the model and convert it:
+
+```
+python3 convert.py --model_id THUDM/glm-4-9b-chat --output {your_path}/glm-4-9b-chat-ov
+```
+The conversion process is as follows:
+```
+====Exporting IR=====
+Framework not specified. Using pt to export the model.
+Loading checkpoint shards: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 10/10 [00:04<00:00,  2.14it/s]
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Using framework PyTorch: 2.3.1+cu121
+Mixed-Precision assignment โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 160/160 โ€ข 0:01:45 โ€ข 0:00:00
+INFO:nncf:Statistics of the bitwidth distribution:
+โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฏโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฏโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”‘
+โ”‚   Num bits (N) โ”‚ % all parameters (layers)   โ”‚ % ratio-defining parameters (layers)   โ”‚
+โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฟโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฟโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ฅ
+โ”‚              8 โ”‚ 31% (76 / 163)              โ”‚ 20% (73 / 160)                         โ”‚
+โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค
+โ”‚              4 โ”‚ 69% (87 / 163)              โ”‚ 80% (87 / 160)                         โ”‚
+โ”•โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ทโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ทโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”™
+Applying Weight Compression โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” 100% 163/163 โ€ข 0:03:46 โ€ข 0:00:00
+Configuration saved in glm-4-9b-ov/openvino_config.json
+====Exporting tokenizer=====
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+```
+
+### Optional parameters
+
+* `--model_id` - Path to the directory where the model is located (absolute path), or a Hugging Face model ID.
+
+* `--output` - Path to where the converted model is saved.
+
+* `--precision` - Precision of the conversion: `fp16`, `int8`, or `int4` (default).
+## 3. Run the GLM-4-9B-Chat model
+
+```
+python3 openvino_cli_demo.py --model_path {your_path}/glm-4-9b-chat-ov --max_sequence_length 4096 --device CPU
+```
+
+### Optional parameters
+
+* `--model_path` - Path to the directory where the OpenVINO IR model is located.
+* `--max_sequence_length` - Maximum number of output tokens.
+* `--device` - Device to run inference on.
+
+### Reference code
+
+This code is modified based on the [OpenVINO official example](https://github.com/OpenVINO-dev-contest/chatglm3.openvino).
\ No newline at end of file
diff --git a/intel_device_demo/openvino/convert.py b/intel_device_demo/openvino/convert.py
new file mode 100644
index 0000000..67e0572
--- /dev/null
+++ b/intel_device_demo/openvino/convert.py
@@ -0,0 +1,72 @@
+"""
+This script converts the original Hugging Face model to OpenVINO IR format.
+The original code can be found at https://github.com/OpenVINO-dev-contest/chatglm3.openvino/blob/main/convert.py
+"""
+import argparse
+from pathlib import Path
+
+from optimum.intel import OVWeightQuantizationConfig
+from optimum.intel.openvino import OVModelForCausalLM
+from transformers import AutoConfig, AutoTokenizer
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument('-h',
+                        '--help',
+                        action='help',
+                        help='Show this help message and exit.')
+    parser.add_argument('-m',
+                        '--model_id',
+                        default='THUDM/glm-4-9b-chat',
+                        required=False,
+                        type=str,
+                        help='Original model path or Hugging Face model ID')
+    parser.add_argument('-p',
+                        '--precision',
+                        required=False,
+                        default="int4",
+                        type=str,
+                        choices=["fp16", "int8", "int4"],
+                        help='fp16, int8 or int4')
+    parser.add_argument('-o',
+                        '--output',
+                        default='./glm-4-9b-ov',
+                        required=False,
+                        type=str,
+                        help='Path to save the IR model')
+    args = parser.parse_args()
+
+    # Create the output directory if it does not exist yet.
+    ir_model_path = Path(args.output)
+    if not ir_model_path.exists():
+        ir_model_path.mkdir(parents=True)
+
+    model_kwargs = {
+        "trust_remote_code": True,
+        "config": AutoConfig.from_pretrained(args.model_id, trust_remote_code=True),
+    }
+    # Weight-compression settings for the int4 export: asymmetric quantization,
+    # 128-channel groups, and roughly 80% of the weights compressed to 4 bit
+    # (the remainder stays at 8 bit).
+    compression_configs = {
+        "sym": False,
+        "group_size": 128,
+        "ratio": 0.8,
+    }
+
+    print("====Exporting IR=====")
+    if args.precision == "int4":
+        ov_model = OVModelForCausalLM.from_pretrained(
+            args.model_id, export=True, compile=False,
+            quantization_config=OVWeightQuantizationConfig(bits=4, **compression_configs),
+            **model_kwargs)
+    elif args.precision == "int8":
+        ov_model = OVModelForCausalLM.from_pretrained(
+            args.model_id, export=True, compile=False, load_in_8bit=True, **model_kwargs)
+    else:
+        ov_model = OVModelForCausalLM.from_pretrained(
+            args.model_id, export=True, compile=False, load_in_8bit=False, **model_kwargs)
+
+    ov_model.save_pretrained(ir_model_path)
+
+    print("====Exporting tokenizer=====")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id, trust_remote_code=True)
+    tokenizer.save_pretrained(ir_model_path)
\ No newline at end of file
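As a reading aid (not part of the patch), here is how the int4 branch above assembles its settings into the `OVWeightQuantizationConfig` it passes to `from_pretrained`; the `ratio` value is why the conversion log reports roughly 80% of the ratio-defining weights at 4 bits, with the remainder kept at 8 bits:

```python
# Sketch of the weight-compression settings convert.py uses for int4 export.
from optimum.intel import OVWeightQuantizationConfig

quant_config = OVWeightQuantizationConfig(
    bits=4,          # target bitwidth for compressed weights
    sym=False,       # asymmetric quantization
    group_size=128,  # weights are quantized in groups of 128
    ratio=0.8,       # ~80% of weights go to 4 bit; the rest stay at 8 bit
)
# Passed as: OVModelForCausalLM.from_pretrained(..., quantization_config=quant_config)
```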
diff --git a/intel_device_demo/openvino/openvino_cli_demo.py b/intel_device_demo/openvino/openvino_cli_demo.py
new file mode 100644
index 0000000..cfc7b55
--- /dev/null
+++ b/intel_device_demo/openvino/openvino_cli_demo.py
@@ -0,0 +1,122 @@
+import argparse
+from typing import List, Tuple
+from threading import Thread
+import torch
+from optimum.intel.openvino import OVModelForCausalLM
+from transformers import (AutoTokenizer, AutoConfig,
+                          TextIteratorStreamer, StoppingCriteriaList, StoppingCriteria)
+
+
+class StopOnTokens(StoppingCriteria):
+    """Stop generation as soon as the last generated token is a stop token."""
+
+    def __init__(self, token_ids):
+        self.token_ids = token_ids
+
+    def __call__(
+            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        for stop_id in self.token_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument('-h',
+                        '--help',
+                        action='help',
+                        help='Show this help message and exit.')
+    parser.add_argument('-m',
+                        '--model_path',
+                        required=True,
+                        type=str,
+                        help='Required. Path to the OpenVINO IR model')
+    parser.add_argument('-l',
+                        '--max_sequence_length',
+                        default=256,
+                        required=False,
+                        type=int,
+                        help='Maximum number of new tokens to generate')
+    parser.add_argument('-d',
+                        '--device',
+                        default='CPU',
+                        required=False,
+                        type=str,
+                        help='Device for inference')
+    args = parser.parse_args()
+    model_dir = args.model_path
+
+    # Latency-oriented OpenVINO configuration: one inference stream, no model cache.
+    ov_config = {"PERFORMANCE_HINT": "LATENCY",
+                 "NUM_STREAMS": "1", "CACHE_DIR": ""}
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_dir, trust_remote_code=True)
+
+    print("====Compiling model====")
+    ov_model = OVModelForCausalLM.from_pretrained(
+        model_dir,
+        device=args.device,
+        ov_config=ov_config,
+        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
+        trust_remote_code=True,
+    )
+
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+    )
+    # GLM-4 stop token IDs: <|endoftext|>, <|user|> and <|observation|>.
+    stop_tokens = [StopOnTokens([151329, 151336, 151338])]
+
+    def convert_history_to_token(history: List[Tuple[str, str]]):
+        """Convert (user, assistant) turn pairs into chat-template token IDs."""
+        messages = []
+        for idx, (user_msg, model_msg) in enumerate(history):
+            if idx == len(history) - 1 and not model_msg:
+                messages.append({"role": "user", "content": user_msg})
+                break
+            if user_msg:
+                messages.append({"role": "user", "content": user_msg})
+            if model_msg:
+                messages.append({"role": "assistant", "content": model_msg})
+
+        model_inputs = tokenizer.apply_chat_template(messages,
+                                                     add_generation_prompt=True,
+                                                     tokenize=True,
+                                                     return_tensors="pt")
+        return model_inputs
+
+    history = []
+    print("====Starting conversation====")
+    while True:
+        input_text = input("User: ")
+        if input_text.lower() == 'stop':
+            break
+
+        if input_text.lower() == 'clear':
+            history = []
+            print("AI assistant: conversation history cleared")
+            continue
+
+        print("GLM-4-9B-OpenVINO:", end=" ")
+        history = history + [[input_text, ""]]
+        model_inputs = convert_history_to_token(history)
+        generate_kwargs = dict(
+            input_ids=model_inputs,
+            max_new_tokens=args.max_sequence_length,
+            temperature=0.1,
+            do_sample=True,
+            top_p=1.0,
+            top_k=50,
+            repetition_penalty=1.1,
+            streamer=streamer,
+            stopping_criteria=StoppingCriteriaList(stop_tokens)
+        )
+
+        # Run generation in a background thread so tokens can be consumed
+        # from the streamer as they are produced.
+        t1 = Thread(target=ov_model.generate, kwargs=generate_kwargs)
+        t1.start()
+
+        partial_text = ""
+        for new_text in streamer:
+            print(new_text, end="", flush=True)
+            partial_text += new_text
+        print("\n")
+        history[-1][1] = partial_text
\ No newline at end of file
diff --git a/intel_device_demo/openvino/requirements.txt b/intel_device_demo/openvino/requirements.txt
new file mode 100644
index 0000000..9c74afb
--- /dev/null
+++ b/intel_device_demo/openvino/requirements.txt
@@ -0,0 +1,2 @@
+optimum>=1.20.0
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@c1ee8ac0864e25e22ea56b5a37a35451531da0e6
\ No newline at end of file