Merge pull request #634 from sixsixcoder/main
Support for GLM-4-9B-Chat-hf and GLM-4v-9B models on vLLM >= 0.6.3 and transformers >= 4.46.0
commit bca86f8c8e

README.md (21 changed lines)
@@ -11,6 +11,7 @@ Read this in [English](README_en.md)

 ## Project Updates

+- 🔥🔥 **News**: ```2024/11/01```: Added support for running the GLM-4-9B-Chat-hf and GLM-4v-9B models on vLLM >= 0.6.3 and transformers >= 4.46.0
 - 🔥🔥 **News**: ```2024/10/25```: We open-sourced [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice), an end-to-end Chinese-English voice dialogue model
 - 🔥 **News**: ```2024/10/12```: Added GLM-4v-9B model support for the vLLM framework
 - 🔥 **News**: ```2024/09/06```: Added an OpenAI-API-compatible server for the GLM-4v-9B model
@@ -54,12 +55,14 @@ GLM-4V-9B。**GLM-4V-9B** 具备 1120 * 1120 高分辨率下的中英双语多…

 ## Model List

-| Model            | Type | Seq Length | Download | Online Demo |
-|------------------|------|------------|----------|-------------|
-| GLM-4-9B         | Base | 8K   | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/glm-4-9b) | / |
-| GLM-4-9B-Chat    | Chat | 128K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary)<br> [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
-| GLM-4-9B-Chat-1M | Chat | 1M   | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
-| GLM-4V-9B        | Chat | 8K   | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
+| Model               | Type | Seq Length | Transformers | vLLM     | Download | Online Demo |
+|:-------------------:|:----:|:----------:|:------------:|:--------:|:--------:|:-----------:|
+| GLM-4-9B            | Base | 8K   | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/glm-4-9b) | / |
+| GLM-4-9B-Chat       | Chat | 128K | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary)<br> [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-HF    | Chat | 128K | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-hf)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-hf) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary)<br> [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-1M    | Chat | 1M   | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
+| GLM-4-9B-Chat-1M-HF | Chat | 1M   | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m-hf)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m-hf) | / |
+| GLM-4V-9B           | Chat | 8K   | >= 4.46 | >= 0.6.3 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |

 ## Evaluation Results
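The new Transformers / vLLM columns encode a compatibility split: the original repos target transformers <= 4.45 via remote code, while the new *-hf repos require >= 4.46, where the GLM architecture ships with transformers itself. A small sketch for picking the matching chat checkpoint at runtime; the `pick_chat_checkpoint` helper is illustrative, not part of the repository:

```python
# Sketch: choose the chat checkpoint that matches the installed transformers
# version, following the compatibility table above.
from packaging.version import Version
import transformers

def pick_chat_checkpoint() -> str:
    if Version(transformers.__version__) >= Version("4.46.0"):
        return "THUDM/glm-4-9b-chat-hf"  # native implementation in transformers >= 4.46
    return "THUDM/glm-4-9b-chat"         # remote-code implementation for <= 4.45

print(pick_chat_checkpoint())
```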
@@ -151,7 +154,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 import os

 os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Set GPU IDs: one for a single GPU, several for single-node multi-GPU
-MODEL_PATH = "THUDM/glm-4-9b-chat"
+MODEL_PATH = "THUDM/glm-4-9b-chat-hf"

 device = "cuda" if torch.cuda.is_available() else "cpu"
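For context, a minimal end-to-end load of the new default path (a sketch; with transformers >= 4.46 the GLM architecture is built in, so `trust_remote_code` should no longer be needed for the *-hf checkpoints):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/glm-4-9b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,  # BF16 inference
    device_map="auto",
).eval()

# Build the chat prompt and generate a short reply.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "你好"}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True))
```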
@@ -192,7 +195,7 @@ from vllm import LLM, SamplingParams

 # max_model_len, tp_size = 1048576, 4
 # If you hit OOM, reduce max_model_len or increase tp_size
 max_model_len, tp_size = 131072, 1
-model_name = "THUDM/glm-4-9b-chat"
+model_name = "THUDM/glm-4-9b-chat-hf"
 prompt = [{"role": "user", "content": "你好"}]

 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
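The two knobs in this snippet trade memory for context length and parallelism: lowering `max_model_len` shrinks the KV cache, while raising `tp_size` shards the weights across more GPUs. A hedged sketch of wiring these values into the engine (mirrors the snippet above; not new repository code):

```python
from vllm import LLM, SamplingParams

max_model_len, tp_size = 131072, 1
llm = LLM(
    model="THUDM/glm-4-9b-chat-hf",
    max_model_len=max_model_len,   # smaller value -> smaller KV cache
    tensor_parallel_size=tp_size,  # larger value -> weights sharded over more GPUs
    trust_remote_code=True,
)

# LLM.chat applies the model's chat template before generation.
outputs = llm.chat(
    [{"role": "user", "content": "你好"}],
    SamplingParams(temperature=0.8, max_tokens=128),
)
print(outputs[0].outputs[0].text)
```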
@@ -290,7 +293,7 @@ for o in outputs:

 ## Complete Project List

-如果你想更进一步了解 GLM-4-9B 系列开源模型,本开源仓库通过以下内容为开发者提供基础的 GLM-4-9B的使用和开发代码
+如果你想更进一步了解 GLM-4-9B 系列开源模型,本开源仓库通过以下内容为开发者提供基础的 GLM-4-9B 的使用和开发代码

 + [basic_demo](basic_demo/README.md): Contains
   + Interaction code using the transformers and vLLM backends
README_en.md (23 changed lines)
@@ -8,7 +8,7 @@

 </p>

 ## Update

+- 🔥🔥 **News**: ```2024/11/01```: Support for the GLM-4-9B-Chat-hf and GLM-4v-9B models on vLLM >= 0.6.3 and transformers >= 4.46.0.
 - 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
 - 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for the vLLM framework.
 - 🔥 **News**: ```2024/09/06```: Add support for an OpenAI API server on the GLM-4v-9B model.
@@ -67,14 +67,17 @@ GPT-4-turbo-2024-04-09, Gemini 1.0 Pro, Qwen-VL-Max, and Claude 3 Opus.

 ## Model List

-| Model            | Type | Seq Length | Download | Online Demo |
-|------------------|------|------------|----------|-------------|
-| GLM-4-9B         | Base | 8K   | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B) | / |
-| GLM-4-9B-Chat    | Chat | 128K | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary)<br> [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
-| GLM-4-9B-Chat-1M | Chat | 1M   | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
-| GLM-4V-9B        | Chat | 8K   | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b) [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
+| Model               | Type | Seq Length | Transformers | vLLM     | Download | Online Demo |
+|:-------------------:|:----:|:----------:|:------------:|:--------:|:--------:|:-----------:|
+| GLM-4-9B            | Base | 8K   | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/glm-4-9b) | / |
+| GLM-4-9B-Chat       | Chat | 128K | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary)<br> [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-HF    | Chat | 128K | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-hf)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-hf) | [🤖 ModelScope CPU](https://modelscope.cn/studios/dash-infer/GLM-4-Chat-DashInfer-Demo/summary)<br> [🤖 ModelScope vLLM](https://modelscope.cn/studios/ZhipuAI/glm-4-9b-chat-vllm/summary) |
+| GLM-4-9B-Chat-1M    | Chat | 1M   | <= 4.45 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4-9B-Chat-1M) | / |
+| GLM-4-9B-Chat-1M-HF | Chat | 1M   | >= 4.46 | <= 0.6.2 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-9b-chat-1m-hf)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m-hf) | / |
+| GLM-4V-9B           | Chat | 8K   | >= 4.46 | >= 0.6.3 | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4v-9b)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)<br> [🟣 WiseModel](https://wisemodel.cn/models/ZhipuAI/GLM-4V-9B) | [🤖 ModelScope](https://modelscope.cn/studios/ZhipuAI/glm-4v-9b-Demo/summary) |
 ## BenchMark

 ### Typical Tasks
@@ -168,7 +171,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 import os

 os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Set the GPU IDs; list several for multi-GPU inference
-MODEL_PATH = "THUDM/glm-4-9b-chat"
+MODEL_PATH = "THUDM/glm-4-9b-chat-hf"

 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -208,7 +211,7 @@ from vllm import LLM, SamplingParams

 # GLM-4-9B-Chat
 # If you encounter OOM, you can try to reduce max_model_len or increase tp_size
 max_model_len, tp_size = 131072, 1
-model_name = "THUDM/glm-4-9b-chat"
+model_name = "THUDM/glm-4-9b-chat-hf"
 prompt = [{"role": "user", "content": "你好"}]

 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -302,7 +302,10 @@ def generate_stream_glm4v(model: AutoModel, tokenizer: AutoTokenizer, params: dict):

             inputs.append({"role": "user", "content": user_msg})
         if model_msg:
             inputs.append({"role": "assistant", "content": model_msg})
-    inputs.append({"role": "user", "content": query, "image": image_list[0]})
+    if len(image_list) >= 1:
+        inputs.append({"role": "user", "content": query, "image": image_list[0]})
+    else:
+        inputs.append({"role": "user", "content": query})

     model_inputs = tokenizer.apply_chat_template(
         inputs,
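The added branch is a guard for text-only requests: with an empty image_list, the old unconditional image_list[0] would raise an IndexError, so the image is now attached to the final user message only when at least one image was supplied.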
@@ -207,9 +207,6 @@ async def generate_stream_glm4(params):

         "top_p": top_p,
         "top_k": -1,
         "repetition_penalty": repetition_penalty,
-        "use_beam_search": False,
-        "length_penalty": 1,
-        "early_stopping": False,
         "stop_token_ids": [151329, 151336, 151338],
         "ignore_eos": False,
         "max_tokens": max_new_tokens,
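Newer vLLM releases dropped the beam-search fields from SamplingParams, which is why use_beam_search, length_penalty, and early_stopping are deleted rather than merely defaulted. A trimmed construction that should be accepted by vLLM >= 0.6.3 (a sketch; the concrete values are illustrative stand-ins for the handler's variables):

```python
from vllm import SamplingParams

# Only keys still accepted by newer SamplingParams; values are illustrative.
params_dict = {
    "temperature": 0.8,
    "top_p": 0.8,
    "top_k": -1,
    "repetition_penalty": 1.0,
    "stop_token_ids": [151329, 151336, 151338],
    "ignore_eos": False,
    "max_tokens": 1024,
    "skip_special_tokens": True,
}
sampling_params = SamplingParams(**params_dict)
```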
@@ -218,7 +215,7 @@ async def generate_stream_glm4(params):

         "skip_special_tokens": True,
     }
     sampling_params = SamplingParams(**params_dict)
-    async for output in engine.generate(inputs=inputs, sampling_params=sampling_params, request_id=f"{time.time()}"):
+    async for output in engine.generate(prompt=inputs, sampling_params=sampling_params, request_id=f"{time.time()}"):
         output_len = len(output.outputs[0].token_ids)
         input_len = len(output.prompt_token_ids)
         ret = {
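This keyword rename appears to track the vLLM engine API: in recent 0.6.x releases, AsyncLLMEngine.generate takes the templated prompt through the `prompt` parameter rather than the older `inputs` keyword, matching the vllm >= 0.6.3 target of this PR.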
@@ -95,12 +95,12 @@ def function_chat(use_stream=False):

 def simple_chat(use_stream=False):
     messages = [
         {
-            "role": "system",
+            "role": "user",
             "content": "请在你输出的时候都带上“喵喵喵”三个字,放在开头。",
         },
         {
             "role": "user",
-            "content": "你是谁"
+            "content": "你是猫吗"
         }
     ]
     response = client.chat.completions.create(
@@ -201,7 +201,12 @@ def glm4v_simple_image_chat(use_stream=False, img_path=None):

 if __name__ == "__main__":
+    # Testing the text model
     simple_chat(use_stream=False)

+    # Testing the text model with tools
     # function_chat(use_stream=False)

+    # Testing the multimodal model with an image
     # glm4v_simple_image_chat(use_stream=False, img_path="demo.jpg")
@@ -10,14 +10,14 @@ tiktoken>=0.7.0

 numpy==1.26.4  # Needs to be < 2.0.0
 accelerate>=1.0.1
 sentence_transformers>=3.1.1
-gradio>=4.44.1  # web demo
+gradio==4.44.1  # web demo
 openai>=1.51.0  # openai demo
 einops>=0.8.0
 pillow>=10.4.0
 sse-starlette>=2.1.3
 bitsandbytes>=0.43.3  # INT4 loading

-# vllm>=0.6.4  # using with vLLM framework
+# vllm>=0.6.3  # using with vLLM framework
 # flash-attn>=2.6.3  # using with flash-attention 2
 # PEFT model, not needed if you don't use a PEFT finetuned model.
 # peft>=0.13.0  # using with a finetuned model
@@ -29,6 +29,8 @@ tokenizer = AutoTokenizer.from_pretrained(

     trust_remote_code=True,
     encode_special_tokens=True
 )

+
+## For BF16 inference
 model = AutoModel.from_pretrained(
     MODEL_PATH,
     trust_remote_code=True,

@@ -37,7 +39,6 @@ model = AutoModel.from_pretrained(

     device_map="auto",
 ).eval()
-
 ## For INT4 inference
 # model = AutoModel.from_pretrained(
 #     MODEL_PATH,
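The commented INT4 block is truncated in this diff; a sketch of how 4-bit loading is typically wired with bitsandbytes (the `quantization_config` argument and the model path are assumptions consistent with the `bitsandbytes>=0.43.3 # INT4 loading` requirement above, not text from this commit):

```python
from transformers import AutoModel, BitsAndBytesConfig

MODEL_PATH = "THUDM/glm-4-9b-chat"  # illustrative path

## For INT4 inference (assumed wiring, not repository code)
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # 4-bit weights via bitsandbytes
    device_map="auto",
).eval()
```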
@@ -3,6 +3,9 @@ This script creates an interactive web demo for the GLM-4-9B model using Gradio,

 a Python library for building quick and easy UI components for machine learning models.
 It's designed to showcase the capabilities of the GLM-4-9B model in a user-friendly interface,
 allowing users to interact with the model through a chat-like interface.
+
+Note:
+    Using glm-4-9b-chat-hf requires `transformers>=4.46.0`.
 """

 import os
@@ -27,7 +30,7 @@ from transformers import (

 ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
 TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

-MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
+MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat-hf')
 TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
@@ -7,7 +7,7 @@ Usage:

 Requirements:
 - Gradio package
-  - Type `pip install gradio` to install Gradio.
+  - Type `pip install gradio==4.44.1` to install Gradio.
 """

 import os
@@ -18,7 +18,7 @@ from transformers import (

     AutoTokenizer,
     StoppingCriteria,
     StoppingCriteriaList,
-    TextIteratorStreamer, AutoModel, BitsAndBytesConfig
+    TextIteratorStreamer, AutoModel
 )
 from PIL import Image
 import requests