Merge pull request #556 from sixsixcoder/main

Update multi-GPU inference with transformers for glm-4 and glm-4v

commit 81af3cfc5a

README.md (30 lines changed)
@@ -146,10 +146,14 @@ GLM-4V-9B 是一个多模态语言模型,具备视觉理解能力,其相关
 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import os

-device = "cuda"
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # 设置 GPU 编号,如果单机单卡指定一个,单机多卡指定多个 GPU 编号
+MODEL_PATH = "THUDM/glm-4-9b-chat"

-tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
+device = "cuda" if torch.cuda.is_available() else "cpu"

+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

 query = "你好"

@@ -162,11 +166,12 @@ inputs = tokenizer.apply_chat_template([{"role": "user", "content": query}],

 inputs = inputs.to(device)
 model = AutoModelForCausalLM.from_pretrained(
-    "THUDM/glm-4-9b-chat",
+    MODEL_PATH,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
-    trust_remote_code=True
-).to(device).eval()
+    trust_remote_code=True,
+    device_map="auto"
+).eval()

 gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
 with torch.no_grad():
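For convenience, here is how the updated GLM-4-9B-Chat snippet reads once the two hunks above are applied. The `apply_chat_template` call and the generation loop that the diff only shows as context (or omits) are filled in here as an illustrative sketch, not a verbatim quote of the README; `device_map="auto"` requires the `accelerate` package to be installed.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # one GPU id for single-GPU, e.g. '0,1' for multi-GPU inference
MODEL_PATH = "THUDM/glm-4-9b-chat"

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

query = "你好"

# The chat-template call below is only partially visible in the diff context; sketched here.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(device)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto",  # accelerate shards the weights across the GPUs visible via CUDA_VISIBLE_DEVICES
).eval()

gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs["input_ids"].shape[1]:]  # drop the prompt tokens
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```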
@@ -216,10 +221,14 @@ print(outputs[0].outputs[0].text)
 import torch
 from PIL import Image
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import os

-device = "cuda"
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # 设置 GPU 编号,如果单机单卡指定一个,单机多卡指定多个 GPU 编号
+MODEL_PATH = "THUDM/glm-4v-9b"

-tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)
+device = "cuda" if torch.cuda.is_available() else "cpu"

+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

 query = '描述这张图片'
 image = Image.open("your image").convert('RGB')

@@ -229,11 +238,12 @@ inputs = tokenizer.apply_chat_template([{"role": "user", "image": image, "conten

 inputs = inputs.to(device)
 model = AutoModelForCausalLM.from_pretrained(
-    "THUDM/glm-4v-9b",
+    MODEL_PATH,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
-    trust_remote_code=True
-).to(device).eval()
+    trust_remote_code=True,
+    device_map="auto"
+).eval()

 gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
 with torch.no_grad():
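The GLM-4V-9B example after these two hunks, again consolidated into one runnable sketch; the template call and generation loop are completed from the surrounding README context and should be treated as illustrative.

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # set one or more GPU ids, e.g. '0,1'
MODEL_PATH = "THUDM/glm-4v-9b"

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

query = '描述这张图片'  # "Describe this picture"
image = Image.open("your image").convert('RGB')

# Completed from the README context around the hunk; treat as a sketch.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(device)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto",  # let accelerate place layers on the visible GPUs
).eval()

gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```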
README_en.md (32 lines changed)
@@ -163,10 +163,14 @@ Use the transformers backend for inference:
 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import os

-device = "cuda"
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Set the GPU number. If inference with multiple GPUs, set multiple GPU numbers
+MODEL_PATH = "THUDM/glm-4-9b-chat"

-tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
+device = "cuda" if torch.cuda.is_available() else "cpu"

+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

 query = "你好"

@@ -179,11 +183,12 @@ inputs = tokenizer.apply_chat_template([{"role": "user", "content": query}],

 inputs = inputs.to(device)
 model = AutoModelForCausalLM.from_pretrained(
-    "THUDM/glm-4-9b-chat",
+    MODEL_PATH,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
-    trust_remote_code=True
-).to(device).eval()
+    trust_remote_code=True,
+    device_map="auto"
+).eval()

 gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
 with torch.no_grad():
@@ -233,12 +238,16 @@ Use the transformers backend for inference:
 import torch
 from PIL import Image
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import os

-device = "cuda"
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Set the GPU number. If inference with multiple GPUs, set multiple GPU numbers
+MODEL_PATH = "THUDM/glm-4v-9b"

-tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)
+device = "cuda" if torch.cuda.is_available() else "cpu"

-query = 'display this image'
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

+query = '描述这张图片'
 image = Image.open("your image").convert('RGB')
 inputs = tokenizer.apply_chat_template([{"role": "user", "image": image, "content": query}],
                                        add_generation_prompt=True, tokenize=True, return_tensors="pt",

@@ -246,11 +255,12 @@ inputs = tokenizer.apply_chat_template([{"role": "user", "image": image, "conten

 inputs = inputs.to(device)
 model = AutoModelForCausalLM.from_pretrained(
-    "THUDM/glm-4v-9b",
+    MODEL_PATH,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
-    trust_remote_code=True
-).to(device).eval()
+    trust_remote_code=True,
+    device_map="auto"
+).eval()

 gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
 with torch.no_grad():
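The README_en.md changes mirror the Chinese README above. The practical effect of replacing `.to(device)` with `device_map="auto"` is that accelerate decides where each layer lives, using every GPU exposed through CUDA_VISIBLE_DEVICES. A minimal sketch for inspecting the resulting placement (transformers sets `hf_device_map` on the model whenever a device_map is used); the two-GPU id string is just an example:

```python
import os
import torch
from transformers import AutoModelForCausalLM

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # expose two GPUs, for example

model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto",
).eval()

# Mapping of module names to devices chosen by accelerate,
# e.g. {'transformer.embedding': 0, ..., 'transformer.output_layer': 1}
print(model.hf_device_map)
```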
The remaining hunks belong to a third changed file, a transformers-based demo/server script (its filename is not shown in this view):
@@ -17,8 +17,10 @@ from transformers import (
     AutoModel,
     TextIteratorStreamer
 )
+from peft import PeftModelForCausalLM
 from PIL import Image
 from io import BytesIO
+from pathlib import Path

 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16

@@ -365,16 +367,39 @@ torch.cuda.empty_cache()

 if __name__ == "__main__":
     MODEL_PATH = sys.argv[1]
-    tokenizer = AutoTokenizer.from_pretrained(
+    model_dir = Path(MODEL_PATH).expanduser().resolve()
+    if (model_dir / 'adapter_config.json').exists():
+        import json
+        with open(model_dir / 'adapter_config.json', 'r', encoding='utf-8') as file:
+            config = json.load(file)
+        model = AutoModel.from_pretrained(
+            config.get('base_model_name_or_path'),
+            trust_remote_code=True,
+            device_map='auto',
+            torch_dtype=TORCH_TYPE
+        )
+        model = PeftModelForCausalLM.from_pretrained(
+            model=model,
+            model_id=model_dir,
+            trust_remote_code=True,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            config.get('base_model_name_or_path'),
+            trust_remote_code=True,
+            encode_special_tokens=True
+        )
+        model.eval().to(DEVICE)
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
             MODEL_PATH,
             trust_remote_code=True,
             encode_special_tokens=True
         )
         model = AutoModel.from_pretrained(
             MODEL_PATH,
             torch_dtype=TORCH_TYPE,
             trust_remote_code=True,
             device_map="auto",
         ).eval().to(DEVICE)

     uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
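This hunk lets the script accept either a plain (or merged) checkpoint or a PEFT/LoRA output directory: the adapter case is detected by the presence of adapter_config.json, whose base_model_name_or_path field names the base model to load before wrapping it with PeftModelForCausalLM. A minimal standalone sketch of that detection step, with a hypothetical resolve_checkpoint helper (not part of the PR):

```python
import json
from pathlib import Path


def resolve_checkpoint(model_path: str):
    """Return (base_model_path, adapter_dir); adapter_dir is None for a plain or merged checkpoint.

    Hypothetical helper illustrating the adapter_config.json check added in the diff above.
    """
    model_dir = Path(model_path).expanduser().resolve()
    adapter_cfg = model_dir / "adapter_config.json"
    if adapter_cfg.exists():
        # A PEFT/LoRA output dir: the base model id is recorded in adapter_config.json
        config = json.loads(adapter_cfg.read_text(encoding="utf-8"))
        return config.get("base_model_name_or_path"), model_dir
    return str(model_dir), None


# Example: base, adapter = resolve_checkpoint("output/checkpoint-3000")  # example path only
# If adapter is not None, load the base model first, then wrap it with
# PeftModelForCausalLM.from_pretrained(model=base_model, model_id=adapter).
```

With this change the demo can be launched as before with a model id such as THUDM/glm-4v-9b, or pointed at a fine-tuning output directory containing adapter_config.json.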