From 4ab7a1efd15294b5bd0b2f4978400910f90a6166 Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Tue, 16 Jul 2024 17:08:50 +0800
Subject: [PATCH] Update dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                      |  4 ++--
 README_en.md                   |  7 ++++---
 basic_demo/requirements.txt    | 33 +++++++++++++--------------------
 basic_demo/vllm_cli_demo.py    |  5 ++---
 finetune_demo/requirements.txt |  4 ++--
 5 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 6d439ef..ac1293b 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@
 Read this in [English](README_en.md)
 
 ## 项目更新
-
-- 🔥🔥 **News**: ``2024/7/9``: GLM-4-9B-Chat
+- 🔥🔥 **News**: ``2024/7/16``: The `transformers` version that GLM-4-9B-Chat depends on has been upgraded to `4.42.4`. Please update the model configuration file and refer to `basic_demo/requirements.txt` to update the dependencies.
+- 🔥 **News**: ``2024/7/9``: GLM-4-9B-Chat
   has been adapted to [Ollama](https://github.com/ollama/ollama) and [Llama.cpp](https://github.com/ggerganov/llama.cpp); you can check the specific details in this [PR](https://github.com/ggerganov/llama.cpp/pull/8031).
 - 🔥 **News**: ``2024/7/1``: We have updated the fine-tuning of GLM-4V-9B. You need to update the run files and configuration files in our model repository
   to support this feature. For more fine-tuning details (such as dataset format and GPU memory requirements), please refer to [finetune_demo](finetune_demo).
diff --git a/README_en.md b/README_en.md
index b9afc85..e094288 100644
--- a/README_en.md
+++ b/README_en.md
@@ -8,9 +8,10 @@
 </p>
 
 ## Update
-
-- 🔥🔥 **News**: ``2024/7/9``: The GLM-4-9B-Chat model has been adapted to [Ollama](https://github.com/ollama/ollama)
-  and  [Llama.cpp](https://github.com/ggerganov/llama.cpp), you can check the specific details
+- 🔥🔥 **News**: ``2024/7/16``: The `transformers` version that the GLM-4-9B-Chat model depends on has been upgraded
+  to `4.42.4`. Please update the model configuration file and refer to `basic_demo/requirements.txt` to update the dependencies.
+- 🔥 **News**: ``2024/7/9``: The GLM-4-9B-Chat model has been adapted to [Ollama](https://github.com/ollama/ollama)
+  and [Llama.cpp](https://github.com/ggerganov/llama.cpp); you can check the specific details
   in [PR](https://github.com/ggerganov/llama.cpp/pull/8031).
 - 🔥 **News**: ``2024/7/1``: We have updated the multimodal fine-tuning of GLM-4V-9B. You need to update the run file and
   configuration file of our model repository to support this feature. For more fine-tuning details (such as dataset
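
To verify the `transformers==4.42.4` bump together with the refreshed model files, a quick load-and-generate check along these lines can help. This is a minimal sketch, assuming `THUDM/glm-4-9b-chat` is reachable locally or via the Hugging Face Hub and a bfloat16-capable GPU is available; the prompt and generation settings are illustrative rather than taken from the demos.

```python
# Minimal sketch: confirm GLM-4-9B-Chat loads and generates under transformers 4.42.4.
# Prompt and generation settings are illustrative, not taken from the demos.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/glm-4-9b-chat"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
).eval()

messages = [{"role": "user", "content": "Hello, who are you?"}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)

# Strip the prompt tokens before decoding the reply.
reply = output_ids[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(reply, skip_special_tokens=True))
```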
diff --git a/basic_demo/requirements.txt b/basic_demo/requirements.txt
index 9ab348f..3878fe2 100644
--- a/basic_demo/requirements.txt
+++ b/basic_demo/requirements.txt
@@ -1,27 +1,20 @@
-# use vllm
-# vllm>=0.5.0
-
 torch>=2.3.0
 torchvision>=0.18.0
-transformers==4.40.0
+transformers==4.42.4
 huggingface-hub>=0.23.1
 sentencepiece>=0.2.0
-pydantic>=2.7.1
-timm>=0.9.16
+pydantic>=2.8.2
+timm>=1.0.7
 tiktoken>=0.7.0
-accelerate>=0.30.1
-sentence_transformers>=2.7.0
-
-# web demo
-gradio>=4.33.0
-
-# openai demo
-openai>=1.34.0
-einops>=0.7.0
-sse-starlette>=2.1.0
-
-# INT4
-bitsandbytes>=0.43.1
+accelerate>=0.32.1
+sentence_transformers>=3.0.1
+gradio>=4.38.1 # web demo
+openai>=1.35.0 # openai demo
+einops>=0.8.0
+sse-starlette>=2.1.2
+bitsandbytes>=0.43.1 # INT4 Loading
 
+# vllm>=0.5.2
+# flash-attn>=2.5.9 # use with FlashAttention-2
 # PEFT model, not needed if you don't use a PEFT fine-tuned model.
-# peft>=0.11.0
\ No newline at end of file
+# peft>=0.11.1
\ No newline at end of file
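
After reinstalling from the updated `basic_demo/requirements.txt`, the pins that actually changed can be spot-checked with a small script like the sketch below; the package list is illustrative and only covers versions bumped by this patch.

```python
# Minimal sketch: report installed versions of the dependencies bumped in this patch.
from importlib.metadata import PackageNotFoundError, version

bumped = {
    "transformers": "4.42.4",           # exact pin
    "accelerate": "0.32.1",             # minimum
    "timm": "1.0.7",                    # minimum
    "sentence-transformers": "3.0.1",   # minimum
}

for package, wanted in bumped.items():
    try:
        print(f"{package}: installed {version(package)} (patch expects {wanted})")
    except PackageNotFoundError:
        print(f"{package}: not installed")
```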
diff --git a/basic_demo/vllm_cli_demo.py b/basic_demo/vllm_cli_demo.py
index f6acf0f..b5cc0a3 100644
--- a/basic_demo/vllm_cli_demo.py
+++ b/basic_demo/vllm_cli_demo.py
@@ -15,7 +15,7 @@ from transformers import AutoTokenizer
 from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine
 from typing import List, Dict
 
-MODEL_PATH = 'THUDM/glm-4-9b'
+MODEL_PATH = 'THUDM/glm-4-9b-chat'
 
 
 def load_model_and_tokenizer(model_dir: str):
@@ -25,7 +25,7 @@ def load_model_and_tokenizer(model_dir: str):
         tensor_parallel_size=1,
         dtype="bfloat16",
         trust_remote_code=True,
-        gpu_memory_utilization=0.3,
+        gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=True,
         engine_use_ray=False,
@@ -63,7 +63,6 @@ async def vllm_gen(messages: List[Dict[str, str]], top_p: float, temperature: fl
         "use_beam_search": False,
         "length_penalty": 1,
         "early_stopping": False,
-        "stop_token_ids": [151329, 151336, 151338],
         "ignore_eos": False,
         "max_tokens": max_dec_len,
         "logprobs": None,
diff --git a/finetune_demo/requirements.txt b/finetune_demo/requirements.txt
index e805a5b..4485eec 100644
--- a/finetune_demo/requirements.txt
+++ b/finetune_demo/requirements.txt
@@ -1,7 +1,7 @@
 jieba>=0.42.1
-datasets>2.20.0
+datasets>=2.20.0
 peft>=0.11.1
-deepspeed>=0.14.3
+deepspeed>=0.14.4
 nltk==3.8.1
 rouge_chinese>=1.0.3
 ruamel.yaml>=0.18.6
\ No newline at end of file