diff --git a/basic_demo/glm_server.py b/basic_demo/glm_server.py
index 966648e..2ae8b22 100644
--- a/basic_demo/glm_server.py
+++ b/basic_demo/glm_server.py
@@ -673,7 +673,6 @@ if __name__ == "__main__":
         gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=False,
-        engine_use_ray=False,
         disable_log_requests=True,
         max_model_len=MAX_MODEL_LENGTH,
     )
diff --git a/basic_demo/requirements.txt b/basic_demo/requirements.txt
index 26f6115..16ba7f0 100644
--- a/basic_demo/requirements.txt
+++ b/basic_demo/requirements.txt
@@ -1,23 +1,23 @@
 torch>=2.4.0
 torchvision>=0.19.0
-transformers==4.44.0
-huggingface-hub>=0.24.5
+transformers>=4.45.0
+huggingface-hub>=0.25.1
 sentencepiece>=0.2.0
 jinja2>=3.1.4
-pydantic>=2.8.2
-timm>=1.0.8
+pydantic>=2.9.2
+timm>=1.0.9
 tiktoken>=0.7.0
 numpy==1.26.4 # Need less than 2.0.0
-accelerate>=0.33.0
-sentence_transformers>=3.0.1
-gradio>=4.42.0 # web demo
-openai>=1.43.0 # openai demo
+accelerate>=0.34.0
+sentence_transformers>=3.1.1
+gradio>=4.44.1 # web demo
+openai>=1.51.0 # openai demo
 einops>=0.8.0
 pillow>=10.4.0
 sse-starlette>=2.1.3
 bitsandbytes>=0.43.3 # INT4 Loading
-# vllm==0.5.4 # using with VLLM Framework
-# flash-attn>=2.6.1 # using with flash-attention 2
+# vllm>=0.6.2 # using with VLLM Framework
+# flash-attn>=2.6.3 # using with flash-attention 2
 # PEFT model, not need if you don't use PEFT finetune model.
-# peft>=0.12.2 # Using with finetune model
\ No newline at end of file
+# peft>=0.13.0 # Using with finetune model
\ No newline at end of file
diff --git a/basic_demo/trans_cli_demo.py b/basic_demo/trans_cli_demo.py
index cbd0ba0..935a86c 100644
--- a/basic_demo/trans_cli_demo.py
+++ b/basic_demo/trans_cli_demo.py
@@ -37,11 +37,7 @@ MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
 # return model, tokenizer
 
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_PATH,
-    trust_remote_code=True,
-    encode_special_tokens=True
-)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 
 
 model = AutoModel.from_pretrained(
     MODEL_PATH,
diff --git a/basic_demo/vllm_cli_demo.py b/basic_demo/vllm_cli_demo.py
index 24da1d4..7810fd8 100644
--- a/basic_demo/vllm_cli_demo.py
+++ b/basic_demo/vllm_cli_demo.py
@@ -30,7 +30,6 @@ def load_model_and_tokenizer(model_dir: str, enable_lora: bool):
         gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=True,
-        engine_use_ray=False,
         disable_log_requests=True
         # If you run into OOM, consider enabling the parameter below
         # enable_chunked_prefill=True,
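
Note on the engine_use_ray removal in both demos: recent vLLM releases dropped this argument from AsyncEngineArgs, so keeping it raises a TypeError at engine construction under the vllm>=0.6.2 pinned in requirements.txt. A minimal sketch of the updated engine setup under that assumption; the model path and max_model_len value below are placeholders for illustration, not values taken from this patch:

```python
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Same arguments as glm_server.py above, minus the removed engine_use_ray.
engine_args = AsyncEngineArgs(
    model="THUDM/glm-4-9b-chat",  # placeholder model path
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    enforce_eager=True,
    worker_use_ray=False,
    disable_log_requests=True,
    max_model_len=8192,  # placeholder; glm_server.py uses MAX_MODEL_LENGTH
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```

The same construction applies inside vllm_cli_demo.py's load_model_and_tokenizer, which differs only in passing worker_use_ray=True.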