fix issue #74

parent 8102212b9f
commit ce2667cf5d
@@ -19,7 +19,7 @@ GLM-4V-9B。**GLM-4V-9B** 具备 1120 * 1120 高分辨率下的中英双语多
 表现出超越 GPT-4-turbo-2024-04-09、Gemini
 1.0 Pro、Qwen-VL-Max 和 Claude 3 Opus 的卓越性能。
 
-## 模型列表
+## Model List
 
 | Model | Type | Seq Length | Download | Online Demo |
 |-------|------|------------|----------|-------------|
@@ -93,6 +93,7 @@ on [Berkeley Function Calling Leaderboard](https://github.com/ShishirPatil/goril
 | ChatGLM3-6B   | 57.88 | 62.18 | 69.78 | 5.42  |
 | GLM-4-9B-Chat | 81.00 | 80.26 | 84.40 | 87.92 |
+
 
 ### Multi-Modal
 
 GLM-4V-9B is a multimodal language model with visual understanding capabilities. The evaluation results of its related
@@ -114,7 +115,7 @@ classic tasks are as follows:
 
 ## Quick call
 
-**硬件配置和系统要求,请查看[这里](basic_demo/README_en.md)。**
+**For hardware configuration and system requirements, please check [here](basic_demo/README_en.md).**
 
 ### Use the following method to quickly call the GLM-4-9B-Chat language model
 
@@ -21,4 +21,7 @@ einops>=0.7.0
 sse-starlette>=2.1.0
 
 # INT4
 bitsandbytes>=0.43.1
+
+# PEFT model, not needed if you don't use a PEFT fine-tuned model.
+# peft>=0.11.0
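The two optional pins above back two optional code paths: `bitsandbytes` provides the 4-bit kernels for the `# INT4` load, and `peft` is only needed for adapter checkpoints. A minimal sketch (not from this commit) of the quantized load that `bitsandbytes>=0.43.1` enables, assuming a CUDA GPU and the repo's chat checkpoint:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load GLM-4-9B-Chat with 4-bit quantized weights via bitsandbytes.
model = AutoModelForCausalLM.from_pretrained(
    'THUDM/glm-4-9b-chat',
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    low_cpu_mem_usage=True,
).eval()
```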
@@ -13,45 +13,38 @@ ensuring that the CLI interface displays formatted text correctly.
 import os
 import torch
 from threading import Thread
-from typing import Union
-from pathlib import Path
-from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    StoppingCriteria,
-    StoppingCriteriaList,
-    TextIteratorStreamer
-)
-
-ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
-TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel
 
 MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
 
-def load_model_and_tokenizer(
-        model_dir: Union[str, Path], trust_remote_code: bool = True
-) -> tuple[ModelType, TokenizerType]:
-    model_dir = Path(model_dir).expanduser().resolve()
-    if (model_dir / 'adapter_config.json').exists():
-        model = AutoPeftModelForCausalLM.from_pretrained(
-            model_dir, trust_remote_code=trust_remote_code, device_map='auto')
-        tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=trust_remote_code, device_map='auto')
-        tokenizer_dir = model_dir
-    tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_dir, trust_remote_code=trust_remote_code, encode_special_tokens=True, use_fast=False
-    )
-    return model, tokenizer
-
-model, tokenizer = load_model_and_tokenizer(MODEL_PATH, trust_remote_code=True)
+## If using a PEFT model.
+# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
+#     if (model_dir / 'adapter_config.json').exists():
+#         model = AutoModel.from_pretrained(
+#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
+#         )
+#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
+#     else:
+#         model = AutoModel.from_pretrained(
+#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
+#         )
+#         tokenizer_dir = model_dir
+#     tokenizer = AutoTokenizer.from_pretrained(
+#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
+#     )
+#     return model, tokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    encode_special_tokens=True
+)
+model = AutoModel.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    device_map="auto",
+    torch_dtype=torch.bfloat16).eval()
 
 class StopOnTokens(StoppingCriteria):
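The adapter-aware loader survives only as the commented block above. For anyone who still needs that path, here is a minimal sketch reconstructed from the removed code; the adapter directory name is a hypothetical placeholder, and the directory must contain an `adapter_config.json`:

```python
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_dir = 'output/my-glm4-adapter'  # hypothetical PEFT output directory

# AutoPeftModelForCausalLM reads adapter_config.json, pulls in the base
# model recorded there, and applies the adapter weights on top of it.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_dir, trust_remote_code=True, device_map='auto'
).eval()

# The tokenizer ships with the base model, not the adapter, so resolve it
# through the adapter's recorded base_model_name_or_path.
base = model.peft_config['default'].base_model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True, use_fast=False)
```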
@@ -17,10 +17,17 @@ def stress_test(token_len, n, num_gpu):
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_PATH,
         trust_remote_code=True,
-        # quantization_config=BitsAndBytesConfig(load_in_4bit=True),
-        # low_cpu_mem_usage=True,
         torch_dtype=torch.bfloat16
     ).to(device).eval()
 
+    # Use INT4 weight inference
+    # model = AutoModelForCausalLM.from_pretrained(
+    #     MODEL_PATH,
+    #     trust_remote_code=True,
+    #     quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+    #     low_cpu_mem_usage=True,
+    # ).eval()
+
     times = []
     decode_times = []
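A plausible reason the INT4 variant is kept as a separate, fully commented call rather than as inline arguments (the commit message doesn't say): transformers rejects `.to(device)` on a model loaded with 4-bit bitsandbytes weights, so the quantized path has to stop at `.eval()` and let bitsandbytes handle placement, while the bf16 path keeps its explicit `.to(device)`.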
@@ -18,6 +18,7 @@ def extract_docx(path):
     for paragraph in doc.paragraphs:
         data.append(paragraph.text)
     content = '\n\n'.join(data)
+    return content
 
 def extract_pptx(path):
     prs = Presentation(path)
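The added `return content` is the actual bug fix in this hunk: previously `extract_docx` assembled the text and then fell off the end of the function, handing `None` to every caller. A quick usage check with the patched function (the file path is a hypothetical example):

```python
text = extract_docx('specs/example.docx')  # hypothetical input file
assert text is not None, 'pre-fix behaviour: extract_docx returned None'
print(text[:200])  # first 200 characters of the extracted paragraphs
```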