From c2c28bc45c308cabd87541f0f18eb37fe775cec1 Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Tue, 29 Oct 2024 00:13:41 +0800
Subject: [PATCH] transformers>=4.46 support

---
 README.md                    |  2 +-
 README_en.md                 |  1 +
 basic_demo/requirements.txt  | 10 +++++-----
 basic_demo/trans_cli_demo.py | 37 +++++++++++++-----------------------
 basic_demo/vllm_cli_demo.py  | 11 ++++-------
 5 files changed, 24 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index 10855b1..b82ef4b 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Read this in [English](README_en.md)

 ## Project Updates

-- 🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
+- 🔥🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
 - 🔥 **News**: ```2024/10/12```: Added GLM-4v-9B model support for the vllm framework
 - 🔥 **News**: ```2024/09/06```: Added an OpenAI API compatible server built on the GLM-4v-9B model
 - 🔥 **News**: ```2024/09/05``` We open-sourced [longcite-glm4-9b](https://huggingface.co/THUDM/LongCite-glm4-9b), a model that enables LLMs to generate fine-grained citations in long-context question answering
diff --git a/README_en.md b/README_en.md
index 3a339b3..4719a3e 100644
--- a/README_en.md
+++ b/README_en.md
@@ -9,6 +9,7 @@

 ## Update

+- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
 - 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for vllm framework.
 - 🔥 **News**: ```2024/09/06```: Add support for OpenAI API server on the GLM-4v-9B model.
 - 🔥 **News**: ```2024/09/05```: We open-sourced a model enabling LLMs to generate fine-grained citations in
diff --git a/basic_demo/requirements.txt b/basic_demo/requirements.txt
index 16ba7f0..4ff1483 100644
--- a/basic_demo/requirements.txt
+++ b/basic_demo/requirements.txt
@@ -1,6 +1,6 @@
-torch>=2.4.0
-torchvision>=0.19.0
-transformers>=4.45.0
+torch>=2.5.0
+torchvision>=0.20.0
+transformers>=4.46.0
 huggingface-hub>=0.25.1
 sentencepiece>=0.2.0
 jinja2>=3.1.4
@@ -8,7 +8,7 @@ pydantic>=2.9.2
 timm>=1.0.9
 tiktoken>=0.7.0
 numpy==1.26.4 # Need less than 2.0.0
-accelerate>=0.34.0
+accelerate>=1.0.1
 sentence_transformers>=3.1.1
 gradio>=4.44.1 # web demo
 openai>=1.51.0 # openai demo
@@ -17,7 +17,7 @@ pillow>=10.4.0
 sse-starlette>=2.1.3
 bitsandbytes>=0.43.3 # INT4 Loading

-# vllm>=0.6.2 # using with VLLM Framework
+# vllm>=0.6.4 # using with VLLM Framework
 # flash-attn>=2.6.3 # using with flash-attention 2
 # PEFT model, not need if you don't use PEFT finetune model.
 # peft>=0.13.0 # Using with finetune model
\ No newline at end of file
diff --git a/basic_demo/trans_cli_demo.py b/basic_demo/trans_cli_demo.py
index 935a86c..2b1d945 100644
--- a/basic_demo/trans_cli_demo.py
+++ b/basic_demo/trans_cli_demo.py
@@ -15,35 +15,22 @@ If you use flash attention, you should install the flash-attn and add attn_impl
 import os
 import torch
 from threading import Thread
-from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
+from transformers import (
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+    GlmForCausalLM
+)

 MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')

-## If use peft model.
-# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
-#     if (model_dir / 'adapter_config.json').exists():
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-#     else:
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model_dir
-#     tokenizer = AutoTokenizer.from_pretrained(
-#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
-#     )
-#     return model, tokenizer
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,trust_remote_code=True)
-
-model = AutoModel.from_pretrained(
+model = GlmForCausalLM.from_pretrained(
     MODEL_PATH,
-    trust_remote_code=True,
     # attn_implementation="flash_attention_2", # Use Flash Attention
-    # torch_dtype=torch.bfloat16, #using flash-attn must use bfloat16 or float16
+    torch_dtype=torch.bfloat16, # using flash-attn must use bfloat16 or float16
     device_map="auto").eval()


@@ -83,6 +70,7 @@ if __name__ == "__main__":
             messages,
             add_generation_prompt=True,
             tokenize=True,
+            return_dict=True,
             return_tensors="pt"
         ).to(model.device)
         streamer = TextIteratorStreamer(
@@ -92,7 +80,8 @@ if __name__ == "__main__":
             skip_special_tokens=True
         )
         generate_kwargs = {
-            "input_ids": model_inputs,
+            "input_ids": model_inputs["input_ids"],
+            "attention_mask": model_inputs["attention_mask"],
             "streamer": streamer,
             "max_new_tokens": max_length,
             "do_sample": True,
diff --git a/basic_demo/vllm_cli_demo.py b/basic_demo/vllm_cli_demo.py
index 7810fd8..1dd159c 100644
--- a/basic_demo/vllm_cli_demo.py
+++ b/basic_demo/vllm_cli_demo.py
@@ -11,7 +11,7 @@ ensuring that the CLI interface displays formatted text correctly.
 """
 import time
 import asyncio
-from transformers import AutoTokenizer
+from transformers import PreTrainedTokenizer
 from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine
 from typing import List, Dict
 from vllm.lora.request import LoRARequest
@@ -20,13 +20,14 @@ MODEL_PATH = 'THUDM/glm-4-9b-chat'
 LORA_PATH = ''

 def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
+    tokenizer = PreTrainedTokenizer.from_pretrained(model_dir)
+
     engine_args = AsyncEngineArgs(
         model=model_dir,
         tokenizer=model_dir,
         enable_lora=enable_lora,
         tensor_parallel_size=1,
         dtype="bfloat16",
-        trust_remote_code=True,
         gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=True,
@@ -35,11 +36,7 @@ def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
         # enable_chunked_prefill=True,
         # max_num_batched_tokens=8192
     )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
-        encode_special_tokens=True
-    )
+
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     return engine, tokenizer
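For context, the snippet below is a minimal sketch of the loading and streaming path that the patched `basic_demo/trans_cli_demo.py` follows. It assumes `transformers>=4.46` (which ships the built-in `GlmForCausalLM` class) and a GLM-4-9B-Chat checkpoint published for that native implementation; the prompt and `max_new_tokens` value are placeholders, not part of the patch.

```python
# Minimal sketch of the transformers>=4.46 loading/streaming flow used by the
# patched demo. The model path is taken from the demo; the checkpoint must be
# compatible with the built-in Glm architecture (an assumption, not verified here).
import torch
from threading import Thread
from transformers import AutoTokenizer, GlmForCausalLM, TextIteratorStreamer

MODEL_PATH = "THUDM/glm-4-9b-chat"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = GlmForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

messages = [{"role": "user", "content": "Hello!"}]  # placeholder prompt
# return_dict=True makes apply_chat_template return both input_ids and
# attention_mask, which the patched demo passes to generate() explicitly.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = Thread(
    target=model.generate,
    kwargs={
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "streamer": streamer,
        "max_new_tokens": 128,  # placeholder value
        "do_sample": True,
    },
)
thread.start()
for token_text in streamer:
    print(token_text, end="", flush=True)
```

Passing the attention mask explicitly, as the patch does, avoids transformers having to infer it from the input IDs during generation.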