first commit

parent a59cc03f08
commit f2c2596063

Binary file not shown.

README.md
@@ -1,4 +1,70 @@
---
license: other
---

# AquilaChat2-7B

![Aquila_logo](./log.jpeg)

<h4 align="center">
    <p>
        <b>English</b> |
        <a href="https://huggingface.co/BAAI/AquilaChat2-7B/blob/main/README_zh.md">简体中文</a>
    </p>
</h4>

We open-source our **Aquila2** series, which now includes the base language models **Aquila2-7B** and **Aquila2-34B**, the chat models **AquilaChat2-7B** and **AquilaChat2-34B**, and the long-text chat models **AquilaChat2-7B-16k** and **AquilaChat2-34B-16k**.

Additional details of the Aquila models will be presented in the official technical report. Please stay tuned for updates on our official channels.

## Quick Start: AquilaChat2-7B (Chat model)

### 1. Inference

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

device = torch.device("cuda:0")
model_info = "BAAI/AquilaChat2-7B"
tokenizer = AutoTokenizer.from_pretrained(model_info, trust_remote_code=True)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_info, trust_remote_code=True, torch_dtype=torch.float16,
    # quantization_config=quantization_config,  # uncomment this line for 4-bit quantization
)
model.eval()
model.to(device)
text = "请给出10个要到北京旅游的理由。"
from predict import predict
out = predict(model, text, tokenizer=tokenizer, max_gen_len=200, top_p=0.95,
              seed=1234, topk=100, temperature=0.9, sft=True, device=device,
              model_name="AquilaChat2-7B")
print(out)
```
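`predict` also accepts an optional `history` list for multi-turn chat (see the predict.py added in this commit). A minimal sketch reusing the `model`, `tokenizer`, and `device` from above; the follow-up question is illustrative:

```python
history = []  # predict() prepends ('USER', ...) / ('ASSISTANT', ...) pairs in place
out = predict(model, "请给出10个要到北京旅游的理由。", tokenizer=tokenizer,
              max_gen_len=200, sft=True, device=device,
              model_name="AquilaChat2-7B", history=history)
follow_up = predict(model, "第3条再详细一点。", tokenizer=tokenizer,
                    max_gen_len=200, sft=True, device=device,
                    model_name="AquilaChat2-7B", history=history)  # carries prior turns
```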

## License

The Aquila2 series of open-source models is released under the [BAAI Aquila Model License Agreement](https://huggingface.co/BAAI/AquilaChat2-7B/blob/main/BAAI-Aquila-Model-License%20-Agreement.pdf).

## Citation

Feel free to cite the repo if you think Aquila2 is useful.

```bibtex
@misc{zhang2024aquila2technicalreport,
      title={Aquila2 Technical Report},
      author={Bo-Wen Zhang and Liangdong Wang and Jijie Li and Shuhao Gu and Xinya Wu and Zhengduo Zhang and Boyan Gao and Yulong Ao and Guang Liu},
      year={2024},
      eprint={2408.07410},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2408.07410},
}
```

@@ -0,0 +1,51 @@
---
license: other
---

![Aquila_logo](./log.jpeg)

<h4 align="center">
    <p>
        <a href="https://huggingface.co/BAAI/AquilaChat2-7B/blob/main/README.md">English</a> |
        <b>简体中文</b>
    </p>
</h4>

# 悟道·天鹰 (Aquila2)

We open-source our **Aquila2** series, which now includes the base language models **Aquila2-7B** and **Aquila2-34B**, the chat models **AquilaChat2-7B** and **AquilaChat2-34B**, and the long-text chat models **AquilaChat2-7B-16k** and **AquilaChat2-34B-16k**.

Additional details of the 悟道·天鹰 (Aquila) models will be presented in the official technical report. Please follow the official channels for updates.

## Quick Start with AquilaChat2-7B

## How to use

### 1. Inference

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

device = torch.device("cuda")
model_info = "BAAI/AquilaChat2-7B"
tokenizer = AutoTokenizer.from_pretrained(model_info, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_info, trust_remote_code=True)
model.eval()
model.to(device)
text = "请给出10个要到北京旅游的理由。"
tokens = tokenizer.encode_plus(text)['input_ids']
tokens = torch.tensor(tokens)[None,].to(device)
stop_tokens = ["###", "[UNK]", "</s>"]
with torch.no_grad():
    # Ban each stop token individually: one id per bad-words entry.
    out = model.generate(tokens, do_sample=True, max_length=512, eos_token_id=100007,
                         bad_words_ids=[[tokenizer.encode(token)[0]] for token in stop_tokens])[0]
out = tokenizer.decode(out.cpu().numpy().tolist())
print(out)
```
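The snippet above decodes the prompt together with the completion. An optional variation (a sketch, using the same variables as above) that decodes only the newly generated tokens:

```python
with torch.no_grad():
    out_ids = model.generate(tokens, do_sample=True, max_length=512,
                             eos_token_id=100007)[0]
# Everything past the prompt length is newly generated; decode just that slice.
completion = tokenizer.decode(out_ids[tokens.shape[1]:].cpu().numpy().tolist())
print(completion)
```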

## License

The Aquila2 series of open-source models is released under the [BAAI Aquila Model License Agreement](https://huggingface.co/BAAI/Aquila2-70B/blob/main/BAAI-Aquila-Model-License-Agreement.pdf).

@@ -0,0 +1,10 @@
{
  "</s>": 100007,
  "<|LDWANG|>": 100002,
  "<|endofpiece|>": 100001,
  "<|startofpiece|>": 100000,
  "[CLS]": 100006,
  "[MASK]": 100003,
  "[gMASK]": 100004,
  "[sMASK]": 100005
}
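These ids line up with the special-token ids used throughout this commit (`bos_token_id: 100006`, `eos_token_id: 100007` in config.json and generation_config.json below). A quick sanity-check sketch, assuming the published repo id:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("BAAI/AquilaChat2-7B", trust_remote_code=True)
print(tok.convert_tokens_to_ids("[CLS]"))  # 100006, used as BOS
print(tok.convert_tokens_to_ids("</s>"))   # 100007, used as EOS
```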

@@ -0,0 +1,27 @@
{
  "architectures": [
    "AquilaForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_aquila.AquilaConfig",
    "AutoModelForCausalLM": "modeling_aquila.AquilaForCausalLM"
  },
  "bos_token_id": 100006,
  "eos_token_id": 100007,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "aquila",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 100008
}
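The `auto_map` entries are what make `trust_remote_code=True` resolve to the custom classes shipped in this repo. A minimal sketch of loading just the configuration:

```python
from transformers import AutoConfig

# auto_map routes this to configuration_aquila.AquilaConfig from the repo.
config = AutoConfig.from_pretrained("BAAI/AquilaChat2-7B", trust_remote_code=True)
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # aquila 4096 32
```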

@@ -0,0 +1 @@
{"framework": "pytorch", "task": "text-generation", "allow_remote": true}

@@ -0,0 +1,128 @@
# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Aquila model configuration"""

from transformers import PretrainedConfig


class AquilaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Aquila-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 100008):
            Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`AquilaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
    Example:

    ```python
    >>> from transformers import AquilaModel, AquilaConfig

    >>> # Initializing an Aquila aquila-7b style configuration
    >>> configuration = AquilaConfig()

    >>> # Initializing a model from the aquila-7b style configuration
    >>> model = AquilaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "aquila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100008,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads

        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
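One detail worth noting above: when `num_key_value_heads` is omitted it falls back to `num_attention_heads`, i.e. plain multi-head attention; a smaller value would describe grouped-query attention. A sketch using the `AquilaConfig` class defined in this file:

```python
mha = AquilaConfig(num_attention_heads=32)                         # 32 KV heads (MHA)
gqa = AquilaConfig(num_attention_heads=32, num_key_value_heads=8)  # 8 KV heads (GQA)
print(mha.num_key_value_heads, gqa.num_key_value_heads)            # 32 8
```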

@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 100006,
  "eos_token_id": 100007,
  "pad_token_id": 0,
  "transformers_version": "4.31.0"
}
File diff suppressed because it is too large
File diff suppressed because it is too large

@@ -0,0 +1,446 @@
"""
Copied from https://github.com/lm-sys/FastChat.
Later we will contribute our changes into it.
"""
import dataclasses
from enum import auto, IntEnum
from typing import Any, Dict, List, Optional, Tuple, Union
import math
import random
import numpy as np

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from transformers import (
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TopKLogitsWarper,
    TemperatureLogitsWarper,
    TopPLogitsWarper,
    StoppingCriteriaList,
    MaxLengthCriteria,
    BitsAndBytesConfig,
)


class SeparatorStyle(IntEnum):
    """Separator styles."""

    ADD_COLON_SINGLE = auto()
    ADD_COLON_TWO = auto()
    ADD_COLON_SPACE_SINGLE = auto()
    NO_COLON_SINGLE = auto()
    NO_COLON_TWO = auto()
    ADD_NEW_LINE_SINGLE = auto()


@dataclasses.dataclass
class Conversation:
    """A class that manages prompt templates and keeps all conversation history."""

    # The name of this template
    name: str
    # The template of the system prompt
    system_template: str = "{system_message}"
    # The system message
    system_message: str = ""
    # The names of two roles
    roles: List[str] = (("USER", "ASSISTANT"),)
    # All messages. Each item is (role, message).
    messages: List[List[str]] = ()
    # The number of few shot examples
    offset: int = 0
    # The separator style and configurations
    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
    sep: str = "\n"
    sep2: str = None
    # Stop criteria (the default one is EOS token)
    stop_str: str = None
    # Stops generation if meeting any token in this list
    stop_token_ids: List[int] = None

    def get_prompt(self) -> str:
        """Get the prompt for generation."""
        system_prompt = self.system_template.format(system_message=self.system_message)
        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ": "  # must end with a space
            return ret
        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
            ret = "" if system_prompt == "" else system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + "\n" + message + self.sep
                else:
                    ret += role + "\n"
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + message + seps[i % 2]
                else:
                    ret += role
            return ret

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def copy(self):
        return Conversation(
            name=self.name,
            system_template=self.system_template,
            system_message=self.system_message,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
        )

    def dict(self):
        return {
            "template_name": self.name,
            "system_message": self.system_message,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
        }


# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template."""
    if not override:
        assert (
            template.name not in conv_templates
        ), f"{template.name} has been registered."

    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a conversation template."""
    return conv_templates[name].copy()


def get_conversation_template(model_path: str) -> Conversation:
    """Get the default conversation template."""
    if "aquila-v1" in model_path:
        return get_conv_template("aquila-v1")
    elif "aquila-chat" in model_path:
        return get_conv_template("aquila-chat")
    elif "aquila-legacy" in model_path:
        return get_conv_template("aquila-legacy")
    else:
        return get_conv_template("aquila")


# AquilaChat default template
# source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py
register_conv_template(
    Conversation(
        name="aquila-chat",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant", "System"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="###",
        sep2="",
        stop_str=["###", "</s>", "[UNK]"],
    )
)

register_conv_template(
    Conversation(
        name="aquila-legacy",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
        roles=("### Human: ", "### Assistant: ", "System"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.NO_COLON_TWO,
        sep="\n",
        sep2="</s>",
        stop_str=["</s>", "[UNK]"],
    )
)

register_conv_template(
    Conversation(
        name="aquila",
        system_message="A chat between a curious human and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
        roles=("Human", "Assistant", "System"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_TWO,
        sep="###",
        sep2="</s>",
        stop_str=["</s>", "[UNK]"],
    )
)

register_conv_template(
    Conversation(
        name="aquila-v1",
        roles=("<|startofpiece|>", "<|endofpiece|>", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.NO_COLON_TWO,
        sep="",
        sep2="</s>",
        stop_str=["</s>", "<|endoftext|>"],
    )
)
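# With an empty system prompt, sep="" and sep2="</s>", the aquila-v1 template above
# renders the two-turn demo from __main__ below as:
#   <|startofpiece|>Hello!<|endofpiece|>Hi!</s><|startofpiece|>How are you?<|endofpiece|>
# User turns are prefixed with <|startofpiece|>; assistant turns are prefixed with
# <|endofpiece|> and terminated by </s>, so a trailing <|endofpiece|> cues the reply.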

if __name__ == "__main__":
    print("aquila template:")
    conv = get_conv_template("aquila")
    conv.append_message(conv.roles[0], "Hello!")
    conv.append_message(conv.roles[1], "Hi!")
    conv.append_message(conv.roles[0], "How are you?")
    conv.append_message(conv.roles[1], None)
    print(conv.get_prompt())

    print("\n")

    print("aquila-chat template:")
    conv = get_conv_template("aquila-chat")
    conv.append_message(conv.roles[0], "Hello!")
    conv.append_message(conv.roles[1], "Hi!")
    conv.append_message(conv.roles[0], "How are you?")
    conv.append_message(conv.roles[1], None)
    print(conv.get_prompt())

    print("\n")

    print("aquila-v1 template:")
    conv = get_conv_template("aquila-v1")
    conv.append_message(conv.roles[0], "Hello!")
    conv.append_message(conv.roles[1], "Hi!")
    conv.append_message(conv.roles[0], "How are you?")
    conv.append_message(conv.roles[1], None)
    print(conv.get_prompt())

    print("\n")

    print("aquila-legacy template:")
    conv = get_conv_template("aquila-legacy")
    conv.append_message(conv.roles[0], "Hello!")
    conv.append_message(conv.roles[1], "Hi!")
    conv.append_message(conv.roles[0], "How are you?")
    conv.append_message(conv.roles[1], None)
    print(conv.get_prompt())

    print("\n")


def set_random_seed(seed):
    """Set random seed for reproducibility."""
    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)


def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token, convo_template="aquila-chat"):
    # aquila-chat as default
    conv = get_conv_template(convo_template)

    # Messages are collected newest-first and reversed below.
    conv.append_message(conv.roles[1], None)
    conv.append_message(conv.roles[0], text)

    example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids']

    if history is None or not isinstance(history, list):
        history = []

    while len(history) > 0 and len(example) < max_token:
        tmp = history.pop()
        if tmp[0] == 'ASSISTANT':
            conv.append_message(conv.roles[1], tmp[1])
        else:
            conv.append_message(conv.roles[0], tmp[1])
        example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids']

    if len(example) >= max_token:
        conv.messages.pop()
    conv.messages = conv.messages[::-1]
    print('model in:', conv.get_prompt())
    example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids']

    return example


def predict(model, text, tokenizer=None,
            max_gen_len=200, top_p=0.95,
            seed=1234, topk=100,
            temperature=0.9,
            sft=True, convo_template="",
            device="cuda",
            model_name="AquilaChat2-7B",
            history=None,
            **kwargs):

    vocab = tokenizer.get_vocab()

    id2word = {v: k for k, v in vocab.items()}

    template_map = {"AquilaChat2-7B": "aquila-v1",
                    "AquilaChat2-34B": "aquila-legacy",
                    "AquilaChat2-7B-16K": "aquila",
                    "AquilaChat2-34B-16K": "aquila"}
    if not convo_template:
        convo_template = template_map.get(model_name, "aquila-chat")

    set_random_seed(seed)
    if temperature == 0:
        topk = 1
        temperature = 1.0
    if sft:
        tokens = covert_prompt_to_input_ids_with_history(text, history=history, tokenizer=tokenizer, max_token=2048, convo_template=convo_template)
        tokens = torch.tensor(tokens)[None,].to(device)
    else:
        tokens = tokenizer.encode_plus(text)["input_ids"]
        print(tokenizer.decode(tokens))
        tokens = torch.tensor(tokens)[None,].to(device)
    input_length = len(tokens[0])
    with torch.no_grad():

        # instantiate logits processors
        logits_processor = LogitsProcessorList(
            [
                MinLengthLogitsProcessor(1, eos_token_id=100007),
            ]
        )
        # instantiate logits warpers
        logits_warper = LogitsProcessorList(
            [
                TopPLogitsWarper(top_p),
                TopKLogitsWarper(topk),
                TemperatureLogitsWarper(temperature),
            ]
        )

        stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=input_length + max_gen_len)])
        out = model.sample(
            tokens,
            logits_processor=logits_processor,
            logits_warper=logits_warper,
            stopping_criteria=stopping_criteria,
            return_dict_in_generate=True,
            output_scores=True,
        )

        # print(out)
        out_ids = out["sequences"][0][input_length:].cpu().numpy()

        out_scores = out["scores"]

        out_scores = torch.cat(out_scores, dim=0)
        out_scores = torch.nn.functional.softmax(out_scores, dim=-1).cpu().numpy()

        # Per-step probability of each sampled token id.
        probs = []
        for i in range(len(out_ids)):
            probs.append(float(out_scores[i][out_ids[i]]))

        # print(f"probs is {probs}")

        convert_tokens = []
        for t in out_ids:
            if t == 100006:
                convert_tokens.append("[CLS]")
            else:
                convert_tokens.append(id2word.get(t, "[unknown_token]"))

        out_text = tokenizer.decode(out_ids.tolist())

        out = out_text

        # Truncate at the first stop marker and trim the token/prob lists to match.
        if "[UNK]" in out:
            special_index = out.index("[UNK]")
            out = out[:special_index]
            token_length = len(tokenizer.encode_plus(out)["input_ids"])
            convert_tokens = convert_tokens[:token_length]
            probs = probs[:token_length]

        if "</s>" in out:
            special_index = out.index("</s>")
            out = out[:special_index]
            token_length = len(tokenizer.encode_plus(out)["input_ids"])
            convert_tokens = convert_tokens[:token_length]
            probs = probs[:token_length]

        if len(out) > 0 and out[0] == " ":
            out = out[1:]

            convert_tokens = convert_tokens[1:]
            probs = probs[1:]

    if isinstance(history, list):
        # Update history
        history.insert(0, ('ASSISTANT', out))
        history.insert(0, ('USER', text))

    return out
Binary file not shown.
Binary file not shown.
Binary file not shown.

@@ -0,0 +1,330 @@
{
  "metadata": {
    "total_size": 29182156800
  },
  "weight_map": {
    "lm_head.weight": "pytorch_model-00003-of-00003.bin",
    "model.embed_tokens.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
    "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
    "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
    "model.norm.weight": "pytorch_model-00003-of-00003.bin"
  }
}
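The `weight_map` tells `from_pretrained` which of the three shards holds each tensor, so loaders only read the shards they need. A small sketch of inspecting it directly, assuming the index file has been downloaded locally:

```python
import json

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)
print(index["metadata"]["total_size"])                   # 29182156800 bytes
print(index["weight_map"]["model.embed_tokens.weight"])  # pytorch_model-00001-of-00003.bin
```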

@@ -0,0 +1,6 @@
{
  "bos_token": "[CLS]",
  "eos_token": "</s>",
  "pad_token": "<|endoftext|>",
  "unk_token": "<|endoftext|>"
}
File diff suppressed because it is too large

@@ -0,0 +1,10 @@
{
  "add_prefix_space": false,
  "bos_token": "[CLS]",
  "clean_up_tokenization_spaces": true,
  "eos_token": "</s>",
  "model_max_length": 2048,
  "padding_side": "right",
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
File diff suppressed because one or more lines are too long