transformers>=4.46 support

parent 9cd635a825
commit c2c28bc45c

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 
 ## Project Updates
 
-- 🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
+- 🔥🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
 - 🔥 **News**: ```2024/10/12```: Added vLLM framework support for the GLM-4v-9B model.
 - 🔥 **News**: ```2024/09/06```: Added an OpenAI-API-compatible server built on the GLM-4v-9B model.
 - 🔥 **News**: ```2024/09/05```: We open-sourced [longcite-glm4-9b](https://huggingface.co/THUDM/LongCite-glm4-9b), a model that enables LLMs to generate fine-grained citations in long-context question answering.

@@ -9,6 +9,7 @@
 
 ## Update
 
+- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
 - 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for vllm framework.
 - 🔥 **News**: ```2024/09/06```: Add support for OpenAI API server on the GLM-4v-9B model.
 - 🔥 **News**: ```2024/09/05```: We open-sourced a model enabling LLMs to generate fine-grained citations in long-context Q&A.

@@ -1,6 +1,6 @@
-torch>=2.4.0
-torchvision>=0.19.0
-transformers>=4.45.0
+torch>=2.5.0
+torchvision>=0.20.0
+transformers>=4.46.0
 huggingface-hub>=0.25.1
 sentencepiece>=0.2.0
 jinja2>=3.1.4
@@ -8,7 +8,7 @@ pydantic>=2.9.2
 timm>=1.0.9
 tiktoken>=0.7.0
 numpy==1.26.4 # Need less than 2.0.0
-accelerate>=0.34.0
+accelerate>=1.0.1
 sentence_transformers>=3.1.1
 gradio>=4.44.1 # web demo
 openai>=1.51.0 # openai demo
@@ -17,7 +17,7 @@ pillow>=10.4.0
 sse-starlette>=2.1.3
 bitsandbytes>=0.43.3 # INT4 Loading
 
-# vllm>=0.6.2 # using with VLLM Framework
+# vllm>=0.6.4 # using with VLLM Framework
 # flash-attn>=2.6.3 # using with flash-attention 2
 # PEFT model, not need if you don't use PEFT finetune model.
 # peft>=0.13.0 # Using with finetune model
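
The dependency bumps above can be sanity-checked before loading the model. This is not part of the commit, just a minimal sketch using the standard library plus `packaging` (already pulled in as a transformers dependency); the minimum versions mirror the updated requirements:

```python
# Quick environment check against the pins introduced by this commit.
from importlib.metadata import version
from packaging.version import Version

MINIMUMS = {
    "torch": "2.5.0",
    "torchvision": "0.20.0",
    "transformers": "4.46.0",
    "accelerate": "1.0.1",
}

for package, minimum in MINIMUMS.items():
    installed = Version(version(package))
    status = "OK" if installed >= Version(minimum) else "TOO OLD"
    print(f"{package}: installed {installed}, required >= {minimum} -> {status}")
```
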
@@ -15,35 +15,22 @@ If you use flash attention, you should install the flash-attn and add attn_impl
 import os
 import torch
 from threading import Thread
-from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
+from transformers import (
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+    GlmForCausalLM
+)
 
 MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
 
-## If use peft model.
-# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
-#     if (model_dir / 'adapter_config.json').exists():
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-#     else:
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model_dir
-#     tokenizer = AutoTokenizer.from_pretrained(
-#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
-#     )
-#     return model, tokenizer
-
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
 
-model = AutoModel.from_pretrained(
+model = GlmForCausalLM.from_pretrained(
     MODEL_PATH,
-    trust_remote_code=True,
     # attn_implementation="flash_attention_2", # Use Flash Attention
-    # torch_dtype=torch.bfloat16, #using flash-attn must use bfloat16 or float16
+    torch_dtype=torch.bfloat16, # using flash-attn must use bfloat16 or float16
     device_map="auto").eval()

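The demo's docstring (quoted in the hunk header) notes that flash attention can be enabled by installing flash-attn and passing `attn_implementation` at load time, which the diff keeps as a commented option. A minimal sketch of that variant, assuming `flash-attn>=2.6.3` is installed and the checkpoint at `MODEL_PATH` is compatible with the native `GlmForCausalLM` class:

```python
import os
import torch
from transformers import AutoTokenizer, GlmForCausalLM

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = GlmForCausalLM.from_pretrained(
    MODEL_PATH,
    attn_implementation="flash_attention_2",  # the commented-out option from the diff
    torch_dtype=torch.bfloat16,               # flash-attn requires bfloat16 or float16
    device_map="auto",
).eval()
```
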
@@ -83,6 +70,7 @@ if __name__ == "__main__":
             messages,
             add_generation_prompt=True,
             tokenize=True,
+            return_dict=True,
             return_tensors="pt"
         ).to(model.device)
         streamer = TextIteratorStreamer(
@@ -92,7 +80,8 @@ if __name__ == "__main__":
             skip_special_tokens=True
         )
         generate_kwargs = {
-            "input_ids": model_inputs,
+            "input_ids": model_inputs["input_ids"],
+            "attention_mask": model_inputs["attention_mask"],
             "streamer": streamer,
             "max_new_tokens": max_length,
             "do_sample": True,

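The two hunks above go together: `apply_chat_template(..., return_dict=True)` now returns a dict, so `generate` receives an explicit `input_ids`/`attention_mask` pair instead of a bare tensor. A condensed sketch of how the pieces fit in the demo's streaming loop (variable names follow the demo; `model`, `tokenizer`, `messages`, and `max_length` are assumed to be set up as in the rest of the script, and the streamer arguments not shown in the diff are illustrative):

```python
from threading import Thread
from transformers import TextIteratorStreamer

# apply_chat_template with return_dict=True yields both input_ids and attention_mask.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)
generate_kwargs = {
    "input_ids": model_inputs["input_ids"],
    "attention_mask": model_inputs["attention_mask"],
    "streamer": streamer,
    "max_new_tokens": max_length,
    "do_sample": True,
}

# Generation runs in a background thread while the main thread prints streamed text.
Thread(target=model.generate, kwargs=generate_kwargs).start()
for new_text in streamer:
    print(new_text, end="", flush=True)
```
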
@@ -11,7 +11,7 @@ ensuring that the CLI interface displays formatted text correctly.
 """
 import time
 import asyncio
-from transformers import AutoTokenizer
+from transformers import PreTrainedTokenizer
 from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine
 from typing import List, Dict
 from vllm.lora.request import LoRARequest

@@ -20,13 +20,14 @@ MODEL_PATH = 'THUDM/glm-4-9b-chat'
 LORA_PATH = ''
 
 def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
+    tokenizer = PreTrainedTokenizer.from_pretrained(model_dir),
 
     engine_args = AsyncEngineArgs(
         model=model_dir,
         tokenizer=model_dir,
         enable_lora=enable_lora,
         tensor_parallel_size=1,
         dtype="bfloat16",
         trust_remote_code=True,
         gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=True,

@@ -35,11 +36,7 @@ def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
         # enable_chunked_prefill=True,
         # max_num_batched_tokens=8192
     )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
-        encode_special_tokens=True
-    )
 
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     return engine, tokenizer

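For completeness, the async engine built by `load_model_and_tokenizer` is consumed through vLLM's asynchronous generate API. The driver below is not part of the commit; it is a minimal sketch assuming the vLLM version referenced in requirements and the function above, with `run_demo` being an illustrative name:

```python
import asyncio
from vllm import SamplingParams

async def run_demo():
    # MODEL_PATH and load_model_and_tokenizer come from the demo script above.
    engine, tokenizer = load_model_and_tokenizer(MODEL_PATH, enable_lora=False)
    params = SamplingParams(temperature=0.8, top_p=0.8, max_tokens=1024)

    # AsyncLLMEngine.generate yields partial RequestOutput objects as tokens arrive.
    text = ""
    async for output in engine.generate("hello", params, request_id="demo-0"):
        text = output.outputs[0].text  # cumulative text generated so far
    print(text)

if __name__ == "__main__":
    asyncio.run(run_demo())
```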