From e3e6de52c45290291f984cfe934839d0954a17ef Mon Sep 17 00:00:00 2001
From: sixgod
Date: Fri, 1 Nov 2024 09:00:39 +0000
Subject: [PATCH] transformers 4.46 and vllm 0.6.3
---
README.md | 21 ++++++++++++---------
README_en.md | 20 +++++++++++---------
basic_demo/glm4v_server.py | 5 ++++-
basic_demo/glm_server.py | 5 +----
basic_demo/openai_api_request.py | 15 ++++++++++-----
basic_demo/requirements.txt | 2 +-
basic_demo/trans_cli_vision_demo.py | 2 +-
basic_demo/trans_web_demo.py | 5 ++++-
basic_demo/trans_web_vision_demo.py | 4 ++--
9 files changed, 46 insertions(+), 33 deletions(-)
diff --git a/README.md b/README.md
index b82ef4b..72eb90b 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Read this in [English](README_en.md)
## 项目更新
+- 🔥🔥 **News**: ```2024/11/01```: 支持了 GLM-4-9B-Chat-hf 和 GLM-4v-9B 模型在 vLLM 0.6.3 以上版本和 transformers 4.46.0 以上版本运行
- 🔥🔥 **News**: ```2024/10/25```: 我们开源了端到端中英语音对话模型 [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
- 🔥 **News**: ```2024/10/12```: 增加了 GLM-4v-9B 模型对vllm框架的支持
- 🔥 **News**: ```2024/09/06```: 增加了在 GLM-4v-9B 模型上构建OpenAI API兼容的服务端
@@ -54,12 +55,14 @@ GLM-4V-9B。**GLM-4V-9B** 具备 1120 * 1120 高分辨率下的中英双语多
## Model List
-| Model | Type | Seq Length | Download | Online Demo |
-|------------------|------|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GLM-4-9B | Base | 8K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/glm-4-9b) | / |
-| GLM-4-9B-Chat | Chat | 128K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary) [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
-| GLM-4-9B-Chat-1M | Chat | 1M | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
-| GLM-4V-9B | Chat | 8K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B ) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
+| Model | Type | Seq Length | Transformers | vLLM | Download | Online Demo |
+|---------------------|------|------------|--------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| GLM-4-9B | Base | 8K | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/glm-4-9b) | / |
+| GLM-4-9B-Chat | Chat | 128K | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary) [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-HF | Chat | 128K | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-hf) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-hf) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary) [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-1M | Chat | 1M | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
+| GLM-4-9B-Chat-1M-HF | Chat | 1M | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m-hf) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m-hf) | / |
+| GLM-4V-9B | Chat | 8K | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
## 评测结果
@@ -151,7 +154,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # 设置 GPU 编号,如果单机单卡指定一个,单机多卡指定多个 GPU 编号
-MODEL_PATH = "THUDM/glm-4-9b-chat"
+MODEL_PATH = "THUDM/glm-4-9b-chat-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -192,7 +195,7 @@ from vllm import LLM, SamplingParams
# max_model_len, tp_size = 1048576, 4
# 如果遇见 OOM 现象,建议减少max_model_len,或者增加tp_size
max_model_len, tp_size = 131072, 1
-model_name = "THUDM/glm-4-9b-chat"
+model_name = "THUDM/glm-4-9b-chat-hf"
prompt = [{"role": "user", "content": "你好"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -290,7 +293,7 @@ for o in outputs:
## 完整项目列表
-如果你想更进一步了解 GLM-4-9B 系列开源模型,本开源仓库通过以下内容为开发者提供基础的 GLM-4-9B的使用和开发代码
+如果你想更进一步了解 GLM-4-9B 系列开源模型,本开源仓库通过以下内容为开发者提供基础的 GLM-4-9B 的使用和开发代码
+ [basic_demo](basic_demo/README.md): 在这里包含了
+ 使用 transformers 和 vLLM 后端的交互代码
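The new `-hf` checkpoints referenced in the table above use the GLM architecture that ships natively in transformers 4.46, so they should load without `trust_remote_code`. A minimal sketch of the updated load path, assuming the `THUDM/glm-4-9b-chat-hf` weights and a CUDA-capable machine:

```python
# Sketch: load the native-format checkpoint on transformers >= 4.46.
# Assumption: GLM is a built-in architecture from 4.46 onwards, so no
# trust_remote_code is needed for the -hf weights.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/glm-4-9b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "你好"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True))
```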
diff --git a/README_en.md b/README_en.md
index 4719a3e..554f0cd 100644
--- a/README_en.md
+++ b/README_en.md
@@ -8,7 +8,7 @@
## Update
-
+- 🔥🔥 **News**: ```2024/11/01```: Added support for running the GLM-4-9B-Chat-hf and GLM-4v-9B models on vLLM >= 0.6.3 and transformers >= 4.46.0.
- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
- 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for vllm framework.
- 🔥 **News**: ```2024/09/06```: Add support for OpenAI API server on the GLM-4v-9B model.
@@ -67,12 +67,14 @@ GPT-4-turbo-2024-04-09, Gemini 1.0 Pro, Qwen-VL-Max, and Claude 3 Opus.
## Model List
-| Model | Type | Seq Length | Download | Online Demo |
-|------------------|------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GLM-4-9B | Base | 8K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B) | / |
-| GLM-4-9B-Chat | Chat | 128K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary) [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
-| GLM-4-9B-Chat-1M | Chat | 1M | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
-| GLM-4V-9B | Chat | 8K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
+| Model | Type | Seq Length | Transformers | vLLM | Download | Online Demo |
+|---------------------|------|------------|--------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| GLM-4-9B | Base | 8K | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/glm-4-9b) | / |
+| GLM-4-9B-Chat | Chat | 128K | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary) [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-HF | Chat | 128K | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-hf) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-hf) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary) [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-1M | Chat | 1M | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
+| GLM-4-9B-Chat-1M-HF | Chat | 1M | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m-hf) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m-hf) | / |
+| GLM-4V-9B | Chat | 8K | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
## BenchMark
@@ -168,7 +170,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # Set the GPU IDs: one for single-GPU inference, several for multi-GPU inference
-MODEL_PATH = "THUDM/glm-4-9b-chat"
+MODEL_PATH = "THUDM/glm-4-9b-chat-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -208,7 +210,7 @@ from vllm import LLM, SamplingParams
# GLM-4-9B-Chat
# If you encounter OOM, you can try to reduce max_model_len or increase tp_size
max_model_len, tp_size = 131072, 1
-model_name = "THUDM/glm-4-9b-chat"
+model_name = "THUDM/glm-4-9b-chat-hf"
prompt = [{"role": "user", "content": "你好"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
diff --git a/basic_demo/glm4v_server.py b/basic_demo/glm4v_server.py
index cffa944..549f661 100644
--- a/basic_demo/glm4v_server.py
+++ b/basic_demo/glm4v_server.py
@@ -302,7 +302,10 @@ def generate_stream_glm4v(model: AutoModel, tokenizer: AutoTokenizer, params: di
inputs.append({"role": "user", "content": user_msg})
if model_msg:
inputs.append({"role": "assistant", "content": model_msg})
- inputs.append({"role": "user", "content": query, "image": image_list[0]})
+ if len(image_list) >= 1:
+ inputs.append({"role": "user", "content": query, "image": image_list[0]})
+ else:
+ inputs.append({"role": "user", "content": query})
model_inputs = tokenizer.apply_chat_template(
inputs,
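The guard added above fixes an `IndexError` on text-only requests, which previously read `image_list[0]` unconditionally. A standalone sketch of the message shapes the server now builds (a hypothetical helper for illustration, not code from the repo):

```python
# Hypothetical helper mirroring the patched logic: attach an image to the
# final user turn only when the request actually supplied one.
def build_last_turn(query, image_list):
    msg = {"role": "user", "content": query}
    if image_list:  # previously image_list[0] was read unconditionally
        msg["image"] = image_list[0]
    return msg

assert build_last_turn("描述这张图片", ["img.png"])["image"] == "img.png"
assert "image" not in build_last_turn("你好", [])  # text-only no longer crashes
```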
diff --git a/basic_demo/glm_server.py b/basic_demo/glm_server.py
index 2ae8b22..d975dea 100644
--- a/basic_demo/glm_server.py
+++ b/basic_demo/glm_server.py
@@ -207,9 +207,6 @@ async def generate_stream_glm4(params):
"top_p": top_p,
"top_k": -1,
"repetition_penalty": repetition_penalty,
- "use_beam_search": False,
- "length_penalty": 1,
- "early_stopping": False,
"stop_token_ids": [151329, 151336, 151338],
"ignore_eos": False,
"max_tokens": max_new_tokens,
@@ -218,7 +215,7 @@ async def generate_stream_glm4(params):
"skip_special_tokens": True,
}
sampling_params = SamplingParams(**params_dict)
- async for output in engine.generate(inputs=inputs, sampling_params=sampling_params, request_id=f"{time.time()}"):
+ async for output in engine.generate(prompt=inputs, sampling_params=sampling_params, request_id=f"{time.time()}"):
output_len = len(output.outputs[0].token_ids)
input_len = len(output.prompt_token_ids)
ret = {
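Both hunks track vLLM API changes landed around 0.6.3: the beam-search fields (`use_beam_search`, `length_penalty`, `early_stopping`) were dropped from `SamplingParams`, and `AsyncLLMEngine.generate` takes its input under the `prompt` keyword rather than `inputs`. If the server has to run across both sides of the rename, a keyword shim is one option; a sketch, assuming only the parameter name changed:

```python
# Sketch: call engine.generate under whichever keyword spelling the
# installed vLLM exposes (assumes the value itself is unchanged).
import inspect

def generate_compat(engine, value, sampling_params, request_id):
    params = inspect.signature(engine.generate).parameters
    kw = "prompt" if "prompt" in params else "inputs"
    return engine.generate(sampling_params=sampling_params,
                           request_id=request_id, **{kw: value})
```

Since `generate` returns an async generator either way, the call site stays `async for output in generate_compat(engine, inputs, sampling_params, f"{time.time()}")`.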
diff --git a/basic_demo/openai_api_request.py b/basic_demo/openai_api_request.py
index 92bee35..f1c7fab 100644
--- a/basic_demo/openai_api_request.py
+++ b/basic_demo/openai_api_request.py
@@ -95,12 +95,12 @@ def function_chat(use_stream=False):
def simple_chat(use_stream=False):
messages = [
{
- "role": "system",
+ "role": "user",
"content": "请在你输出的时候都带上“喵喵喵”三个字,放在开头。",
},
{
"role": "user",
- "content": "你是谁"
+ "content": "你是猫吗"
}
]
response = client.chat.completions.create(
@@ -201,7 +201,12 @@ def glm4v_simple_image_chat(use_stream=False, img_path=None):
if __name__ == "__main__":
- simple_chat(use_stream=False)
- # function_chat(use_stream=False)
- # glm4v_simple_image_chat(use_stream=False, img_path="demo.jpg")
+ # Testing the text model
+ simple_chat(use_stream=False)
+
+ # Testing the text model with tools
+ # function_chat(use_stream=False)
+
+ # Testing the multimodal model with an image
+ # glm4v_simple_image_chat(use_stream=False, img_path="demo.jpg")
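Note that the instruction now travels in a user turn instead of a system message. For reference, a trimmed standalone version of the updated `simple_chat` request; the endpoint, API key, and model name below are placeholders to match to your `glm_server.py` deployment:

```python
# Standalone sketch of the updated simple_chat call. base_url, api_key,
# and model are assumed values; adjust them to your running server.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="glm-4",
    messages=[
        {"role": "user", "content": "请在你输出的时候都带上“喵喵喵”三个字，放在开头。"},
        {"role": "user", "content": "你是猫吗"},
    ],
    max_tokens=256,
    temperature=0.8,
)
print(response.choices[0].message.content)
```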
diff --git a/basic_demo/requirements.txt b/basic_demo/requirements.txt
index 4ff1483..0480a78 100644
--- a/basic_demo/requirements.txt
+++ b/basic_demo/requirements.txt
@@ -17,7 +17,7 @@ pillow>=10.4.0
sse-starlette>=2.1.3
bitsandbytes>=0.43.3 # INT4 Loading
-# vllm>=0.6.4 # using with VLLM Framework
+# vllm>=0.6.3 # using with VLLM Framework
# flash-attn>=2.6.3 # using with flash-attention 2
# PEFT model, not needed if you don't use a PEFT finetuned model.
# peft>=0.13.0 # Using with finetune model
\ No newline at end of file
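A small runtime check against the floors pinned above can catch a mismatched environment early. A sketch, assuming `packaging` is available (it ships as a transformers dependency):

```python
# Sketch: verify the installed stack against the floors in requirements.txt.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

def check(pkg, floor):
    try:
        assert Version(version(pkg)) >= Version(floor), f"{pkg} too old"
    except PackageNotFoundError:
        print(f"{pkg} not installed (optional for some demos)")

check("transformers", "4.46.0")
check("vllm", "0.6.3")  # only needed for the vLLM backend
```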
diff --git a/basic_demo/trans_cli_vision_demo.py b/basic_demo/trans_cli_vision_demo.py
index 30a78d2..758ccc6 100644
--- a/basic_demo/trans_cli_vision_demo.py
+++ b/basic_demo/trans_cli_vision_demo.py
@@ -17,7 +17,7 @@ from transformers import (
AutoTokenizer,
StoppingCriteria,
StoppingCriteriaList,
- TextIteratorStreamer, AutoModel, BitsAndBytesConfig
+ TextIteratorStreamer, AutoModel
)
from PIL import Image
diff --git a/basic_demo/trans_web_demo.py b/basic_demo/trans_web_demo.py
index 1a470de..2d8e35a 100644
--- a/basic_demo/trans_web_demo.py
+++ b/basic_demo/trans_web_demo.py
@@ -3,6 +3,9 @@ This script creates an interactive web demo for the GLM-4-9B model using Gradio,
a Python library for building quick and easy UI components for machine learning models.
It's designed to showcase the capabilities of the GLM-4-9B model in a user-friendly interface,
allowing users to interact with the model through a chat-like interface.
+
+Note:
+ Using glm-4-9b-chat-hf requires `transformers>=4.46.0`.
"""
import os
@@ -27,7 +30,7 @@ from transformers import (
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
-MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
+MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat-hf')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
diff --git a/basic_demo/trans_web_vision_demo.py b/basic_demo/trans_web_vision_demo.py
index 05d1563..91d9875 100644
--- a/basic_demo/trans_web_vision_demo.py
+++ b/basic_demo/trans_web_vision_demo.py
@@ -7,7 +7,7 @@ Usage:
Requirements:
- Gradio package
- - Type `pip install gradio` to install Gradio.
+ - Type `pip install gradio==4.44.1` to install Gradio.
"""
import os
@@ -18,7 +18,7 @@ from transformers import (
AutoTokenizer,
StoppingCriteria,
StoppingCriteriaList,
- TextIteratorStreamer, AutoModel, BitsAndBytesConfig
+ TextIteratorStreamer, AutoModel
)
from PIL import Image
import requests
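With the `BitsAndBytesConfig` import dropped, the vision demos load the model unquantized. A sketch of the load path that remains, assuming GLM-4V-9B still ships custom modeling code (hence the `AutoModel` + `trust_remote_code` usage this patch keeps):

```python
# Sketch: bf16, unquantized load for the vision demos after this patch.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_PATH = "THUDM/glm-4v-9b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
).eval()
```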