fix vLLM example: migrate old code to the new API

zR 2024-06-05 13:42:10 +08:00
parent 29480b7394
commit 1b387cc08d
3 changed files with 20 additions and 16 deletions

.gitignore

@@ -1,6 +1,3 @@
*venv
*.DS_Store
*base_model
*multimodal
chat_model
*.idea/

README.md

@@ -11,8 +11,8 @@ Read this in [English](README_en.md)
## Model Introduction
GLM-4-9B is the open-source version of the latest generation of pre-trained models in the GLM-4 series released by Zhipu AI. In evaluations on datasets covering semantics, mathematics, reasoning, code, and knowledge, **GLM-4-9B** and its human-preference-aligned version **GLM-4-9B-Chat** both show performance surpassing Llama-3-8B. Beyond multi-turn dialogue, GLM-4-9B-Chat also offers advanced capabilities such as web browsing, code execution, custom tool calling (Function Call), and long-context reasoning (up to 128K context). This generation adds multilingual support for 26 languages, including Japanese, Korean, and German. We have also released the **GLM-4-9B-Chat-1M** model, which supports a 1M context length (about 2 million Chinese characters), and **GLM-4V-9B**, a multimodal model based on GLM-4-9B. **GLM-4V-9B** supports Chinese-English bilingual multi-turn dialogue at 1120 * 1120 resolution; in multimodal evaluations covering comprehensive Chinese and English ability, perception and reasoning, text recognition, and chart understanding, GLM-4V-9B
@@ -152,9 +152,13 @@ from vllm import LLM, SamplingParams
# max_model_len, tp_size = 1048576, 4
# GLM-4-9B-Chat
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
# If you encounter OOM, you can try to reduce max_model_len or increase tp_size
max_model_len, tp_size = 131072, 1
model_name = "THUDM/glm-4-9b-chat"
prompt = '你好'
prompt = [{"role": "user", "content": "你好"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = LLM(
@@ -170,11 +174,10 @@ llm = LLM(
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)
inputs = tokenizer.build_chat_input(prompt, history=None, role='user')['input_ids'].tolist()
outputs = llm.generate(prompt_token_ids=inputs, sampling_params=sampling_params)
inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)
generated_text = [output.outputs[0].text for output in outputs]
print(generated_text)
print(outputs[0].outputs[0].text)
```
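
For readers who want to run the updated snippet directly, here is a minimal end-to-end sketch assembled from the hunks above. The `model` and `tensor_parallel_size` arguments to `LLM(...)` are not visible in this diff and are assumed to be wired to `model_name` and `tp_size`; everything else is taken from the changed lines.

```python
# Minimal sketch assembled from the diff above (GLM-4-9B-Chat with the new vLLM API).
# Assumption: LLM() receives model=model_name and tensor_parallel_size=tp_size,
# which the hunks do not show explicitly.
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# If you encounter OOM, reduce max_model_len or increase tp_size
max_model_len, tp_size = 131072, 1
model_name = "THUDM/glm-4-9b-chat"
prompt = [{"role": "user", "content": "你好"}]

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = LLM(
    model=model_name,              # assumed, not shown in the hunk
    tensor_parallel_size=tp_size,  # assumed, not shown in the hunk
    max_model_len=max_model_len,
    trust_remote_code=True,
    enforce_eager=True,
    # if you encounter OOM in GLM-4-9B-Chat-1M, you can try to enable the following parameters
    # enable_chunked_prefill=True,
    # max_num_batched_tokens=8192
)
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)

# New flow: render the chat template to a plain string and let vLLM tokenize it itself.
inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```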
### Use the following method to quickly call the GLM-4V-9B multimodal model

README_en.md

@@ -158,9 +158,13 @@ from vllm import LLM, SamplingParams
# max_model_len, tp_size = 1048576, 4
# GLM-4-9B-Chat
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
# If you encounter OOM, you can try to reduce max_model_len or increase tp_size
max_model_len, tp_size = 131072, 1
model_name = "THUDM/glm-4-9b-chat"
prompt = '你好'
prompt = [{"role": "user", "content": "你好"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = LLM(
@@ -169,18 +173,18 @@ llm = LLM(
max_model_len=max_model_len,
trust_remote_code=True,
enforce_eager=True,
# GLM-4-9B-Chat-1M If you encounter OOM phenomenon, it is recommended to turn on the following parameters
# if you encounter OOM in GLM-4-9B-Chat-1M, you can try to enable the following parameters
# enable_chunked_prefill=True,
# max_num_batched_tokens=8192
)
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)
inputs = tokenizer.build_chat_input(prompt, history=None, role='user')['input_ids'].tolist()
outputs = llm.generate(prompt_token_ids=inputs, sampling_params=sampling_params)
inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
generated_text = [output.outputs[0].text for output in outputs]
print(generated_text)
```
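
To make the API change concrete, here is a small sketch of just the tokenizer step: the removed `build_chat_input` call produced token ids passed via `prompt_token_ids=`, while the new `apply_chat_template(..., tokenize=False, add_generation_prompt=True)` returns the rendered prompt string that `llm.generate(prompts=...)` tokenizes itself. Running it needs only the tokenizer files, so it is a cheap way to inspect the templated prompt before launching the full vLLM engine.

```python
# Sketch of the tokenizer-side difference only; no model weights or GPU needed.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
messages = [{"role": "user", "content": "你好"}]

# Old flow (removed in this commit): token ids, fed to generate(prompt_token_ids=...)
# ids = tokenizer.build_chat_input("你好", history=None, role='user')['input_ids'].tolist()

# New flow (added in this commit): a templated prompt string, fed to generate(prompts=...)
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)  # the string vLLM will tokenize internally
```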
### Use the following method to quickly call the GLM-4V-9B multimodal model