transformers>=4.46 support

This commit is contained in:
zR 2024-10-29 00:13:41 +08:00
parent 9cd635a825
commit c2c28bc45c
5 changed files with 24 additions and 37 deletions

View File

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
## Project Updates
- 🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
- 🔥 **News**: ```2024/10/12```: Added GLM-4v-9B model support for the vLLM framework
- 🔥 **News**: ```2024/09/06```: Added an OpenAI-API-compatible server built on the GLM-4v-9B model
- 🔥 **News**: ```2024/09/05```: We open-sourced [longcite-glm4-9b](https://huggingface.co/THUDM/LongCite-glm4-9b), a model that enables LLMs to generate fine-grained citations in long-context question answering

View File

@@ -9,6 +9,7 @@
## Update
- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
- 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for vllm framework.
- 🔥 **News**: ```2024/09/06```: Add support for OpenAI API server on the GLM-4v-9B model.
- 🔥 **News**: ```2024/09/05```: We open-sourced a model enabling LLMs to generate fine-grained citations in

View File

@@ -1,6 +1,6 @@
torch>=2.4.0
torchvision>=0.19.0
transformers>=4.45.0
torch>=2.5.0
torchvision>=0.20.0
transformers>=4.46.0
huggingface-hub>=0.25.1
sentencepiece>=0.2.0
jinja2>=3.1.4
@@ -8,7 +8,7 @@ pydantic>=2.9.2
timm>=1.0.9
tiktoken>=0.7.0
numpy==1.26.4 # Must be less than 2.0.0
accelerate>=0.34.0
accelerate>=1.0.1
sentence_transformers>=3.1.1
gradio>=4.44.1 # web demo
openai>=1.51.0 # openai demo
@@ -17,7 +17,7 @@ pillow>=10.4.0
sse-starlette>=2.1.3
bitsandbytes>=0.43.3 # INT4 Loading
# vllm>=0.6.2 # using with VLLM Framework
# vllm>=0.6.4 # using with VLLM Framework
# flash-attn>=2.6.3 # using with flash-attention 2
# PEFT model, not needed if you don't use a PEFT fine-tuned model.
# peft>=0.13.0 # Use with a fine-tuned model
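
The floors raised here (torch 2.5.0, torchvision 0.20.0, transformers 4.46.0, accelerate 1.0.1) can be sanity-checked against an existing environment with a small script like the sketch below. It is not part of the commit, and it assumes `packaging` is importable (it is pulled in by transformers itself).

```python
# Minimal sketch (not part of this commit): verify installed packages
# meet the minimum versions raised in requirements.txt above.
from importlib.metadata import version, PackageNotFoundError
from packaging.version import Version  # assumes packaging is available

MINIMUMS = {
    "torch": "2.5.0",
    "torchvision": "0.20.0",
    "transformers": "4.46.0",
    "accelerate": "1.0.1",
}

for name, floor in MINIMUMS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (need >= {floor})")
        continue
    status = "OK" if Version(installed) >= Version(floor) else f"need >= {floor}"
    print(f"{name}: {installed} ({status})")
```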

View File

@@ -15,35 +15,22 @@ If you use flash attention, you should install the flash-attn and add attn_impl
import os
import torch
from threading import Thread
from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
from transformers import (
AutoTokenizer,
StoppingCriteria,
StoppingCriteriaList,
TextIteratorStreamer,
GlmForCausalLM
)
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
## If use peft model.
# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
# if (model_dir / 'adapter_config.json').exists():
# model = AutoModel.from_pretrained(
# model_dir, trust_remote_code=trust_remote_code, device_map='auto'
# )
# tokenizer_dir = model.peft_config['default'].base_model_name_or_path
# else:
# model = AutoModel.from_pretrained(
# model_dir, trust_remote_code=trust_remote_code, device_map='auto'
# )
# tokenizer_dir = model_dir
# tokenizer = AutoTokenizer.from_pretrained(
# tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
# )
# return model, tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(
model = GlmForCausalLM.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# attn_implementation="flash_attention_2", # Use Flash Attention
# torch_dtype=torch.bfloat16, #using flash-attn must use bfloat16 or float16
torch_dtype=torch.bfloat16, # using flash-attn must use bfloat16 or float16
device_map="auto").eval()
@@ -83,6 +70,7 @@ if __name__ == "__main__":
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt"
).to(model.device)
streamer = TextIteratorStreamer(
@@ -92,7 +80,8 @@ if __name__ == "__main__":
skip_special_tokens=True
)
generate_kwargs = {
"input_ids": model_inputs,
"input_ids": model_inputs["input_ids"],
"attention_mask": model_inputs["attention_mask"],
"streamer": streamer,
"max_new_tokens": max_length,
"do_sample": True,

View File

@@ -11,7 +11,7 @@ ensuring that the CLI interface displays formatted text correctly.
"""
import time
import asyncio
from transformers import AutoTokenizer
from transformers import PreTrainedTokenizer
from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine
from typing import List, Dict
from vllm.lora.request import LoRARequest
@@ -20,13 +20,14 @@ MODEL_PATH = 'THUDM/glm-4-9b-chat'
LORA_PATH = ''
def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
tokenizer = PreTrainedTokenizer.from_pretrained(model_dir)
engine_args = AsyncEngineArgs(
model=model_dir,
tokenizer=model_dir,
enable_lora=enable_lora,
tensor_parallel_size=1,
dtype="bfloat16",
trust_remote_code=True,
gpu_memory_utilization=0.9,
enforce_eager=True,
worker_use_ray=True,
@@ -35,11 +36,7 @@ def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
# enable_chunked_prefill=True,
# max_num_batched_tokens=8192
)
tokenizer = AutoTokenizer.from_pretrained(
model_dir,
trust_remote_code=True,
encode_special_tokens=True
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
return engine, tokenizer
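
For context, `AsyncLLMEngine.generate` is an async generator that yields cumulative `RequestOutput` objects, so a streaming loop over the engine returned by `load_model_and_tokenizer` looks roughly like the sketch below. It is illustrative only, not part of the commit; it assumes the loaded tokenizer exposes `apply_chat_template`, and the sampling values are placeholders.

```python
# Illustrative sketch (not part of the commit): stream one completion from
# the AsyncLLMEngine built above. Assumes the tokenizer supports
# apply_chat_template; sampling parameters are placeholders.
import asyncio
from vllm import SamplingParams

async def stream_answer(engine, tokenizer, question: str) -> None:
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": question}],
        add_generation_prompt=True,
        tokenize=False,
    )
    params = SamplingParams(temperature=0.8, top_p=0.8, max_tokens=1024)
    printed = 0
    # generate() yields the full text produced so far on every step.
    async for output in engine.generate(prompt, params, request_id="cli-0"):
        text = output.outputs[0].text
        print(text[printed:], end="", flush=True)
        printed = len(text)
    print()

# Example wiring (hypothetical entry point):
# engine, tokenizer = load_model_and_tokenizer(MODEL_PATH, enable_lora=bool(LORA_PATH))
# asyncio.run(stream_answer(engine, tokenizer, "Hello!"))
```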