transformers>=4.46 support
This commit is contained in:
parent 9cd635a825
commit c2c28bc45c
````diff
@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 
 ## Project Updates
 
-- 🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
+- 🔥🔥 **News**: ```2024/10/25```: We open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice)
 - 🔥 **News**: ```2024/10/12```: Added GLM-4v-9B model support for the vLLM framework
 - 🔥 **News**: ```2024/09/06```: Added an OpenAI-API-compatible server built on the GLM-4v-9B model
 - 🔥 **News**: ```2024/09/05```: We open-sourced [longcite-glm4-9b](https://huggingface.co/THUDM/LongCite-glm4-9b), a model that enables LLMs to generate fine-grained citations in long-context Q&A
````
````diff
@@ -9,6 +9,7 @@
 
 ## Update
 
+- 🔥🔥 **News**: ```2024/10/25```: We have open-sourced the end-to-end Chinese-English voice dialogue model [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
 - 🔥 **News**: ```2024/10/12```: Add GLM-4v-9B model support for vllm framework.
 - 🔥 **News**: ```2024/09/06```: Add support for OpenAI API server on the GLM-4v-9B model.
 - 🔥 **News**: ```2024/09/05```: We open-sourced a model enabling LLMs to generate fine-grained citations in
````
```diff
@@ -1,6 +1,6 @@
-torch>=2.4.0
-torchvision>=0.19.0
-transformers>=4.45.0
+torch>=2.5.0
+torchvision>=0.20.0
+transformers>=4.46.0
 huggingface-hub>=0.25.1
 sentencepiece>=0.2.0
 jinja2>=3.1.4
@@ -8,7 +8,7 @@ pydantic>=2.9.2
 timm>=1.0.9
 tiktoken>=0.7.0
 numpy==1.26.4 # Need less than 2.0.0
-accelerate>=0.34.0
+accelerate>=1.0.1
 sentence_transformers>=3.1.1
 gradio>=4.44.1 # web demo
 openai>=1.51.0 # openai demo
@@ -17,7 +17,7 @@ pillow>=10.4.0
 sse-starlette>=2.1.3
 bitsandbytes>=0.43.3 # INT4 Loading
 
-# vllm>=0.6.2 # using with VLLM Framework
+# vllm>=0.6.4 # using with VLLM Framework
 # flash-attn>=2.6.3 # using with flash-attention 2
 # PEFT model, not need if you don't use PEFT finetune model.
 # peft>=0.13.0 # Using with finetune model
```
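The pinned minimums move together with the new `transformers` floor: `torch>=2.5.0`, `torchvision>=0.20.0`, `transformers>=4.46.0`, `accelerate>=1.0.1`, and the optional `vllm` suggestion becomes `>=0.6.4`. A quick way to check whether an existing environment already meets the raised floors is sketched below; this is a standalone helper, not part of the repository, and it assumes the `packaging` library is installed.

```python
# check_pins.py -- illustrative helper (not part of the repo): compare
# installed package versions against the raised minimums from this commit.
from importlib.metadata import version, PackageNotFoundError
from packaging.version import Version

MINIMUMS = {
    "torch": "2.5.0",
    "torchvision": "0.20.0",
    "transformers": "4.46.0",
    "accelerate": "1.0.1",
}

for name, floor in MINIMUMS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (need >= {floor})")
        continue
    status = "OK" if Version(installed) >= Version(floor) else f"upgrade to >= {floor}"
    print(f"{name}: {installed} ({status})")
```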
```diff
@@ -15,35 +15,22 @@ If you use flash attention, you should install the flash-attn and add attn_impl
 import os
 import torch
 from threading import Thread
-from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
+from transformers import (
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+    GlmForCausalLM
+)
 
 MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
 
-## If use peft model.
-# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
-#     if (model_dir / 'adapter_config.json').exists():
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-#     else:
-#         model = AutoModel.from_pretrained(
-#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-#         )
-#         tokenizer_dir = model_dir
-#     tokenizer = AutoTokenizer.from_pretrained(
-#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
-#     )
-#     return model, tokenizer
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
 
-model = AutoModel.from_pretrained(
+model = GlmForCausalLM.from_pretrained(
     MODEL_PATH,
-    trust_remote_code=True,
     # attn_implementation="flash_attention_2", # Use Flash Attention
-    # torch_dtype=torch.bfloat16, #using flash-attn must use bfloat16 or float16
+    torch_dtype=torch.bfloat16, # using flash-attn must use bfloat16 or float16
     device_map="auto").eval()
```
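The import block now pulls `GlmForCausalLM` from `transformers` instead of relying on `AutoModel` plus `trust_remote_code=True`, since transformers 4.46 ships a native GLM implementation; the commented-out PEFT loader is dropped as well. A minimal sketch of the resulting loading path follows (assuming a checkpoint published in the built-in `glm` format; the final print line is purely illustrative):

```python
# Hedged sketch of the post-4.46 loading path, without trust_remote_code.
import os
import torch
from transformers import AutoTokenizer, GlmForCausalLM

MODEL_PATH = os.environ.get("MODEL_PATH", "THUDM/glm-4-9b-chat")  # path as in the demo

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = GlmForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,   # flash-attn, if enabled, requires bf16/fp16
    device_map="auto",
).eval()

print(type(model).__name__, model.dtype)  # e.g. GlmForCausalLM torch.bfloat16
```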
```diff
@@ -83,6 +70,7 @@ if __name__ == "__main__":
             messages,
             add_generation_prompt=True,
             tokenize=True,
+            return_dict=True,
             return_tensors="pt"
         ).to(model.device)
         streamer = TextIteratorStreamer(
```
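Passing `return_dict=True` makes `apply_chat_template` return a `BatchEncoding` holding both `input_ids` and `attention_mask` rather than a bare tensor of token ids, so the demo can hand `generate` an explicit attention mask. A small illustrative check (the model path follows the demo; the message content is arbitrary):

```python
# Illustrative only: inspect what apply_chat_template returns with return_dict=True.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
model_inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "hi"}],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(list(model_inputs.keys()))        # typically ['input_ids', 'attention_mask']
print(model_inputs["input_ids"].shape)  # e.g. torch.Size([1, N]); depends on the chat template
```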
```diff
@@ -92,7 +80,8 @@ if __name__ == "__main__":
             skip_special_tokens=True
         )
         generate_kwargs = {
-            "input_ids": model_inputs,
+            "input_ids": model_inputs["input_ids"],
+            "attention_mask": model_inputs["attention_mask"],
             "streamer": streamer,
             "max_new_tokens": max_length,
             "do_sample": True,
```
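Because `model_inputs` is now a dict-like object, the generation kwargs unpack it into explicit `input_ids` and `attention_mask` entries. The demo then runs `generate` on a worker thread and consumes tokens through the `TextIteratorStreamer`; a condensed sketch of that pattern, continuing from the loading and chat-template sketches above, with illustrative values for the timeout and token budget:

```python
# Streamed generation: generate() runs on a background thread while the
# main thread iterates over the streamer and prints tokens as they arrive.
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(
    tokenizer,
    timeout=60,               # illustrative timeout in seconds
    skip_prompt=True,
    skip_special_tokens=True,
)
generate_kwargs = {
    "input_ids": model_inputs["input_ids"],
    "attention_mask": model_inputs["attention_mask"],
    "streamer": streamer,
    "max_new_tokens": 1024,   # the demo passes its max_length setting here
    "do_sample": True,
}
Thread(target=model.generate, kwargs=generate_kwargs).start()
for new_token in streamer:
    print(new_token, end="", flush=True)
```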
```diff
@@ -11,7 +11,7 @@ ensuring that the CLI interface displays formatted text correctly.
 """
 import time
 import asyncio
-from transformers import AutoTokenizer
+from transformers import PreTrainedTokenizer
 from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine
 from typing import List, Dict
 from vllm.lora.request import LoRARequest
```
```diff
@@ -20,13 +20,14 @@ MODEL_PATH = 'THUDM/glm-4-9b-chat'
 LORA_PATH = ''
 
 def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
+    tokenizer = PreTrainedTokenizer.from_pretrained(model_dir),
+
     engine_args = AsyncEngineArgs(
         model=model_dir,
         tokenizer=model_dir,
         enable_lora=enable_lora,
         tensor_parallel_size=1,
         dtype="bfloat16",
-        trust_remote_code=True,
         gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=True,
```
```diff
@@ -35,11 +36,7 @@ def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
         # enable_chunked_prefill=True,
         # max_num_batched_tokens=8192
     )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
-        encode_special_tokens=True
-    )
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     return engine, tokenizer
 
```
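With this change the tokenizer is created once at the top of `load_model_and_tokenizer`, the separate `AutoTokenizer` block after the engine arguments is removed, and `trust_remote_code=True` is dropped from the engine arguments. The engine and tokenizer the helper returns are then driven asynchronously; below is a hedged sketch of how such an engine is typically queried for streamed output with vLLM's async API (the prompt, request id, and sampling values are illustrative, and the exact `generate` signature can vary across vllm releases):

```python
# Hedged sketch: streaming completions from an AsyncLLMEngine built with
# arguments similar to those shown in the diff.
import asyncio
from vllm import SamplingParams, AsyncEngineArgs, AsyncLLMEngine

async def stream_demo() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(
            model="THUDM/glm-4-9b-chat",
            dtype="bfloat16",
            tensor_parallel_size=1,
            gpu_memory_utilization=0.9,
            enforce_eager=True,
        )
    )
    params = SamplingParams(temperature=0.8, top_p=0.8, max_tokens=256)  # illustrative values
    printed = 0
    # generate() is an async generator yielding progressively longer RequestOutputs.
    async for request_output in engine.generate("Hello, GLM-4!", params, request_id="demo-1"):
        text = request_output.outputs[0].text
        print(text[printed:], end="", flush=True)
        printed = len(text)

if __name__ == "__main__":
    asyncio.run(stream_demo())
```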