transformers>=4.46 support

This commit is contained in:
zR 2024-10-29 00:13:41 +08:00
parent 9cd635a825
commit c2c28bc45c
5 changed files with 24 additions and 37 deletions

View File

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 ## Project Updates
-- 🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English speech dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
+- 🔥🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English speech dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
 - 🔥 **News**: ```2024/10/12```: Added support for the vllm framework to the GLM-4v-9B model
 - 🔥 **News**: ```2024/09/06```: Added an OpenAI API compatible server built on the GLM-4v-9B model
 - 🔥 **News**: ```2024/09/05```: We open-sourced a model that enables LLMs to generate fine-grained citations in long-context question answering: [longcite-glm4-9b](https://huggingface.co/THUDM/LongCite-glm4-9b)

View File

@@ -9,6 +9,7 @@
 ## Update
+- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
 - 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for vllm framework.
 - 🔥 **News**: ```2024/09/06```: Add support for OpenAI API server on the GLM-4v-9B model.
 - 🔥 **News**: ```2024/09/05```: We open-sourced a model enabling LLMs to generate fine-grained citations in

View File

@@ -1,6 +1,6 @@
-torch>=2.4.0
-torchvision>=0.19.0
-transformers>=4.45.0
+torch>=2.5.0
+torchvision>=0.20.0
+transformers>=4.46.0
 huggingface-hub>=0.25.1
 sentencepiece>=0.2.0
 jinja2>=3.1.4
@@ -8,7 +8,7 @@ pydantic>=2.9.2
 timm>=1.0.9
 tiktoken>=0.7.0
 numpy==1.26.4 # Need less than 2.0.0
-accelerate>=0.34.0
+accelerate>=1.0.1
 sentence_transformers>=3.1.1
 gradio>=4.44.1 # web demo
 openai>=1.51.0 # openai demo
@@ -17,7 +17,7 @@ pillow>=10.4.0
 sse-starlette>=2.1.3
 bitsandbytes>=0.43.3 # INT4 Loading
-# vllm>=0.6.2 # using with VLLM Framework
+# vllm>=0.6.4 # using with VLLM Framework
 # flash-attn>=2.6.3 # using with flash-attention 2
 # PEFT model, not need if you don't use PEFT finetune model.
 # peft>=0.13.0 # Using with finetune model
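As a quick sanity check after bumping these pins, something like the sketch below can confirm an environment meets the new minimums (it assumes the `packaging` library is importable; the package names simply mirror the requirements above):

```python
# Sketch: verify installed versions against the updated minimum pins.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

MINIMUMS = {
    "torch": "2.5.0",
    "torchvision": "0.20.0",
    "transformers": "4.46.0",
    "accelerate": "1.0.1",
}

for pkg, minimum in MINIMUMS.items():
    try:
        installed = Version(version(pkg))
    except PackageNotFoundError:
        print(f"{pkg}: not installed (need >= {minimum})")
        continue
    status = "OK" if installed >= Version(minimum) else f"too old, need >= {minimum}"
    print(f"{pkg} {installed}: {status}")
```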

View File

@@ -15,35 +15,22 @@ If you use flash attention, you should install the flash-attn and add attn_impl
 import os
 import torch
 from threading import Thread
-from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
+from transformers import (
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+    GlmForCausalLM
+)
 
 MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
 
-## If use peft model.
-# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
-#     if (model_dir / 'adapter_config.json').exists():
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-#     else:
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model_dir
-#     tokenizer = AutoTokenizer.from_pretrained(
-#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
-#     )
-#     return model, tokenizer
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,trust_remote_code=True)
-model = AutoModel.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+model = GlmForCausalLM.from_pretrained(
     MODEL_PATH,
-    trust_remote_code=True,
     # attn_implementation="flash_attention_2", # Use Flash Attention
-    # torch_dtype=torch.bfloat16, #using flash-attn must use bfloat16 or float16
+    torch_dtype=torch.bfloat16, # using flash-attn must use bfloat16 or float16
     device_map="auto").eval()
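The commented attn_implementation line above corresponds to the optional flash-attention path mentioned in the file's docstring; a minimal sketch of that variant (assuming flash-attn is installed and the GPU supports bfloat16) could look like:

```python
import torch
from transformers import GlmForCausalLM

MODEL_PATH = "THUDM/glm-4-9b-chat"  # same default as the demo above

# Flash attention 2 requires half precision, hence the explicit bfloat16 dtype.
model = GlmForCausalLM.from_pretrained(
    MODEL_PATH,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()
```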
@@ -83,6 +70,7 @@ if __name__ == "__main__":
             messages,
             add_generation_prompt=True,
             tokenize=True,
+            return_dict=True,
             return_tensors="pt"
         ).to(model.device)
         streamer = TextIteratorStreamer(
@@ -92,7 +80,8 @@ if __name__ == "__main__":
             skip_special_tokens=True
         )
         generate_kwargs = {
-            "input_ids": model_inputs,
+            "input_ids": model_inputs["input_ids"],
+            "attention_mask": model_inputs["attention_mask"],
             "streamer": streamer,
             "max_new_tokens": max_length,
             "do_sample": True,

View File

@@ -11,7 +11,7 @@ ensuring that the CLI interface displays formatted text correctly.
 """
 import time
 import asyncio
-from transformers import AutoTokenizer
+from transformers import PreTrainedTokenizer
 from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine
 from typing import List, Dict
 from vllm.lora.request import LoRARequest
@@ -20,13 +20,14 @@ MODEL_PATH = 'THUDM/glm-4-9b-chat'
 LORA_PATH = ''
 def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
+    tokenizer = PreTrainedTokenizer.from_pretrained(model_dir),
     engine_args = AsyncEngineArgs(
         model=model_dir,
         tokenizer=model_dir,
         enable_lora=enable_lora,
         tensor_parallel_size=1,
         dtype="bfloat16",
+        trust_remote_code=True,
         gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=True,
@@ -35,11 +36,7 @@ def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
         # enable_chunked_prefill=True,
         # max_num_batched_tokens=8192
     )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
-        encode_special_tokens=True
-    )
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     return engine, tokenizer
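For orientation, the loader reads roughly as follows after this change, reassembled from the hunks above. Two editorial liberties are taken: the stray trailing comma after the tokenizer assignment (which would turn `tokenizer` into a one-element tuple) is dropped, and `AutoTokenizer` is used so a concrete tokenizer class is resolved; treat both as assumptions rather than part of the commit:

```python
from transformers import AutoTokenizer
from vllm import AsyncEngineArgs, AsyncLLMEngine

def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
    # Tokenizer comes straight from the model directory; vLLM loads its own copy too.
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    engine_args = AsyncEngineArgs(
        model=model_dir,
        tokenizer=model_dir,
        enable_lora=enable_lora,
        tensor_parallel_size=1,
        dtype="bfloat16",
        trust_remote_code=True,
        gpu_memory_utilization=0.9,
        enforce_eager=True,
        worker_use_ray=True,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    return engine, tokenizer
```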