fix issue #74
This commit is contained in:
parent 8102212b9f
commit ce2667cf5d
@@ -19,7 +19,7 @@ GLM-4V-9B。**GLM-4V-9B** 具备 1120 * 1120 高分辨率下的中英双语多
 表现出超越 GPT-4-turbo-2024-04-09、Gemini
 1.0 Pro、Qwen-VL-Max 和 Claude 3 Opus 的卓越性能。
 
-## 模型列表
+## Model List
 
 | Model            | Type | Seq Length | Download | Online Demo |
 |------------------|------|------------|----------|-------------|
@@ -93,6 +93,7 @@ on [Berkeley Function Calling Leaderboard](https://github.com/ShishirPatil/goril
 | ChatGLM3-6B   | 57.88 | 62.18 | 69.78 | 5.42  |
+| GLM-4-9B-Chat | 81.00 | 80.26 | 84.40 | 87.92 |
 
 
 ### Multi-Modal
 
 GLM-4V-9B is a multimodal language model with visual understanding capabilities. The evaluation results of its related
@@ -114,7 +115,7 @@ classic tasks are as follows:
 
 ## Quick call
 
-**硬件配置和系统要求,请查看[这里](basic_demo/README_en.md)。**
+**For hardware configuration and system requirements, please check [here](basic_demo/README_en.md).**
 
 ### Use the following method to quickly call the GLM-4-9B-Chat language model
 
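For context (not part of the diff): the quick-call snippet that the heading above introduces is cut off by the hunk boundary. A minimal sketch of such a call, assuming the standard chat template shipped with THUDM/glm-4-9b-chat and a CUDA device, looks roughly like:

# Sketch only; the README's exact snippet lies outside this hunk.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
).eval()

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens before decoding the reply.
    outputs = outputs[:, inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))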
@@ -21,4 +21,7 @@ einops>=0.7.0
 sse-starlette>=2.1.0
 
 # INT4
-bitsandbytes>=0.43.1
+bitsandbytes>=0.43.1
+
+# PEFT model, not needed if you don't use PEFT finetune model.
+# peft>=0.11.0
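For reference (not part of the diff): the two optional dependencies above map to loading paths like the following sketch. The model path and adapter directory are illustrative; the INT4 route mirrors the commented BitsAndBytesConfig block that appears in the stress-test hunk further down.

# Sketch only: how the optional requirements are typically exercised.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# INT4 path (needs bitsandbytes): quantize weights to 4 bit at load time.
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat",
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    low_cpu_mem_usage=True,
).eval()

# PEFT path (needs peft): load a finetuned adapter on top of the base model.
# from peft import AutoPeftModelForCausalLM
# model = AutoPeftModelForCausalLM.from_pretrained(
#     "path/to/finetune-output",  # hypothetical adapter directory
#     trust_remote_code=True,
#     device_map="auto",
# ).eval()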
@@ -13,45 +13,38 @@ ensuring that the CLI interface displays formatted text correctly.
 import os
 import torch
 from threading import Thread
-from typing import Union
-from pathlib import Path
-from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    StoppingCriteria,
-    StoppingCriteriaList,
-    TextIteratorStreamer
-)
-
-ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
-TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
 
 MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
 
-
-def load_model_and_tokenizer(
-        model_dir: Union[str, Path], trust_remote_code: bool = True
-) -> tuple[ModelType, TokenizerType]:
-    model_dir = Path(model_dir).expanduser().resolve()
-    if (model_dir / 'adapter_config.json').exists():
-        model = AutoPeftModelForCausalLM.from_pretrained(
-            model_dir, trust_remote_code=trust_remote_code, device_map='auto')
-        tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=trust_remote_code, device_map='auto')
-        tokenizer_dir = model_dir
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_dir, trust_remote_code=trust_remote_code, encode_special_tokens=True, use_fast=False
-    )
-    return model, tokenizer
+## If use peft model.
+# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
+#     if (model_dir / 'adapter_config.json').exists():
+#         model = AutoModel.from_pretrained(
+#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
+#         )
+#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
+#     else:
+#         model = AutoModel.from_pretrained(
+#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
+#         )
+#         tokenizer_dir = model_dir
+#     tokenizer = AutoTokenizer.from_pretrained(
+#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
+#     )
+#     return model, tokenizer
 
-
-model, tokenizer = load_model_and_tokenizer(MODEL_PATH, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    encode_special_tokens=True
+)
+
+model = AutoModel.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    device_map="auto",
+    torch_dtype=torch.bfloat16).eval()
 
 
 class StopOnTokens(StoppingCriteria):
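For context (not part of the diff): the rest of this demo file drives a streaming chat loop with the model and tokenizer loaded above. A condensed sketch of how the pieces fit together, with illustrative details rather than the file's exact code (`StopOnTokens` is the criterion class whose definition begins at the end of this hunk):

# Illustrative sketch of the streaming path this CLI demo uses.
from threading import Thread

def stream_chat(model, tokenizer, history):
    # history is a list of {"role": ..., "content": ...} messages.
    inputs = tokenizer.apply_chat_template(
        history,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=8192,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )
    # Generation runs in a background thread while the streamer yields text.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    for token_text in streamer:
        print(token_text, end="", flush=True)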
@@ -17,10 +17,17 @@ def stress_test(token_len, n, num_gpu):
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_PATH,
         trust_remote_code=True,
         # quantization_config=BitsAndBytesConfig(load_in_4bit=True),
         # low_cpu_mem_usage=True,
         torch_dtype=torch.bfloat16
     ).to(device).eval()
 
+    # Use INT4 weight infer
+    # model = AutoModelForCausalLM.from_pretrained(
+    #     MODEL_PATH,
+    #     trust_remote_code=True,
+    #     quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+    #     low_cpu_mem_usage=True,
+    # ).eval()
     times = []
     decode_times = []
@@ -18,6 +18,7 @@ def extract_docx(path):
     for paragraph in doc.paragraphs:
         data.append(paragraph.text)
     content = '\n\n'.join(data)
     return content
+
 def extract_pptx(path):
     prs = Presentation(path)
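For context (not part of the diff): the body of `extract_pptx` is cut off at the hunk boundary. With python-pptx, the usual continuation walks slides and shapes and collects their text frames; a sketch, not the file's exact code:

# Sketch of a typical python-pptx text extraction, mirroring extract_docx above.
def extract_pptx(path):
    prs = Presentation(path)
    data = []
    for slide in prs.slides:
        for shape in slide.shapes:
            # Only shapes with a text frame carry extractable text.
            if shape.has_text_frame:
                data.append(shape.text_frame.text)
    return '\n\n'.join(data)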