mirror of https://github.com/hpcaitech/ColossalAI
[coati] add chatglm model (#4539)
* update configuration of chatglm and add support in coati
* add unit test & update chatglm default config & fix bos index issue
* remove chatglm due to oom
* add datasets pkg in requirements.txt
* fix parameter issue in test_models
* add ref in tokenize & rm unnecessary parts
* separate source & target tokenization in chatglm
* add unit test to chatglm
* fix test dataset issue
* update truncation of chatglm
* fix Colossalai version
* fix colossal ai version in test
parent 0b00def881
commit 1467e3b41b
@@ -19,7 +19,7 @@ import torch
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer
-
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from colossalai.logging import get_dist_logger

 from .utils import is_rank_0, jload
@@ -71,6 +71,42 @@ def _preprocess(sources: Sequence[str],
     return sequences_token["input_ids"], labels, sequences_token["attention_mask"]


+def _preprocess_chatglm(sources: Sequence[str],
+                        targets: Sequence[str],
+                        tokenizer: PreTrainedTokenizer,
+                        max_length: int,
+                        ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Preprocess the data by tokenizing.
+    None for attention mask, ChatGLM will calculate attention mask according to input ids
+    """
+
+    labels = []
+    input_ids = []
+    for source, target in zip(sources, targets):
+        source_id = tokenizer.encode(text=source, add_special_tokens=False)
+        target_id = tokenizer.encode(text=target, add_special_tokens=False)
+        input_id = tokenizer.build_inputs_with_special_tokens(source_id, target_id)
+        # truncate
+        sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
+        truncate_length = max(0, len(input_id) - max_length)
+        input_id = input_id[truncate_length:]
+        if truncate_length == len(source_id) + 1:
+            input_id = sp_token_list + input_id[1:]
+        elif truncate_length > len(source_id) + 1:
+            input_id = sp_token_list + input_id[2:]
+
+        context_length = input_id.index(tokenizer.bos_token_id)
+        mask_position = context_length - 1
+        label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1:]
+
+        pad_len = max_length - len(input_id)
+        input_id = input_id + [tokenizer.pad_token_id] * pad_len
+        input_ids.append(input_id)
+        labels.append(label + [IGNORE_INDEX] * pad_len)
+    return torch.tensor(input_ids), torch.tensor(labels), None
+
+
 class SFTDataset(Dataset):
     """
     Dataset for sft model
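Illustrative sketch (not part of the diff): how a single prompt/response pair flows through the ChatGLM path above, assuming the ChatGLMTokenizer added in this commit and a locally available THUDM/chatglm-6b vocabulary; IGNORE_INDEX mirrors the label-masking constant used in the dataset module.

from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer

IGNORE_INDEX = -100  # label positions the loss should skip, as in sft_dataset

tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

source_id = tokenizer.encode(text="Instruction: say hello\n\n", add_special_tokens=False)
target_id = tokenizer.encode(text="Hello!", add_special_tokens=False)

# build_inputs_with_special_tokens appends [gMASK] and <sop> after the prompt,
# then concatenates the response: [source] [gMASK] <sop> [target]
input_id = tokenizer.build_inputs_with_special_tokens(source_id, target_id)

context_length = input_id.index(tokenizer.bos_token_id)  # position of <sop>
label = [IGNORE_INDEX] * context_length + input_id[context_length:]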
@@ -94,18 +130,25 @@ class SFTDataset(Dataset):
             data["completion"] + tokenizer.eos_token
             for data in tqdm(dataset, disable=not is_rank_0())
         ]

-        self.input_ids, self.labels, self.attention_mask = \
-            _preprocess(sources, targets, tokenizer, max_length)
+        if isinstance(tokenizer, ChatGLMTokenizer):
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+        else:
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess(sources, targets, tokenizer, max_length)

     def __len__(self):
         length = self.input_ids.shape[0]
         return length

     def __getitem__(self, idx):
-        return dict(input_ids=self.input_ids[idx],
-                    labels=self.labels[idx],
-                    attention_mask=self.attention_mask[idx])
+        if self.attention_mask is not None:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx],
+                        attention_mask=self.attention_mask[idx])
+        else:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx])


 class SupervisedDataset(Dataset):
@@ -137,14 +180,22 @@ class SupervisedDataset(Dataset):
         ]

         logger.info("Tokenizing inputs... This may take some time...")
-        self.input_ids, self.labels, self.attention_mask = \
-            _preprocess(sources, targets, tokenizer, max_length)
+        if isinstance(tokenizer, ChatGLMTokenizer):
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+        else:
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess(sources, targets, tokenizer, max_length)

     def __len__(self):
         length = self.input_ids.shape[0]
         return length

     def __getitem__(self, idx):
-        return dict(input_ids=self.input_ids[idx],
-                    labels=self.labels[idx],
-                    attention_mask=self.attention_mask[idx])
+        if self.attention_mask is not None:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx],
+                        attention_mask=self.attention_mask[idx])
+        else:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx])
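Illustrative sketch (not part of the diff): building the SFT dataset with the ChatGLM tokenizer. Because _preprocess_chatglm returns None for the attention mask, __getitem__ yields only input_ids and labels. The dataset name, subset and keyword arguments mirror the unit test changes further down in this diff and are otherwise assumptions.

from datasets import load_dataset
from coati.dataset.sft_dataset import SFTDataset
from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

# a small slice of a prompt/completion style dataset, as in tests/test_dataset.py
data = load_dataset("yizhongw/self_instruct", "super_natural_instructions")
train_data = data["train"].select(range(2))

sft_dataset = SFTDataset(dataset=train_data, tokenizer=tokenizer, max_length=512)
sample = sft_dataset[0]
print(sample.keys())  # dict_keys(['input_ids', 'labels']), no attention_mask for ChatGLM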
@@ -0,0 +1,3 @@
from .chatglm_actor import ChatGLMActor

__all__ = ['ChatGLMActor']
@@ -0,0 +1,34 @@
from typing import Optional

import torch
from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMForConditionalGeneration

from ..base import Actor


class ChatGLMActor(Actor):
    """
    ChatGLM Actor model.

    Args:
        pretrained (str): Pretrained model name or path.
        config (ChatGLMConfig): Model config.
        checkpoint (bool): Enable gradient checkpointing.

    do not support lora for now.
    """

    def __init__(self,
                 pretrained: str = None,
                 config: Optional[ChatGLMConfig] = None,
                 checkpoint: bool = False) -> None:
        if pretrained is not None:
            model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
        elif config is not None:
            model = ChatGLMForConditionalGeneration(config)
        else:
            model = ChatGLMForConditionalGeneration(ChatGLMConfig())
        if checkpoint:
            model.gradient_checkpointing_enable()
        super().__init__(model, lora_rank=0, lora_train_bias='none')
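Illustrative sketch (not part of the diff): the actor can be built either from pretrained ChatGLM-6B weights or from a ChatGLMConfig; the reduced config values below are arbitrary and only meant for quick smoke tests.

from coati.models.chatglm import ChatGLMActor
from coati.models.chatglm.configuration_chatglm import ChatGLMConfig

# full 6B model from the Hugging Face Hub (requires the checkpoint to be available)
actor = ChatGLMActor(pretrained="THUDM/chatglm-6b")

# or a down-sized model for tests; field names follow configuration_chatglm.py below
tiny_cfg = ChatGLMConfig(num_layers=2,
                         hidden_size=128,
                         inner_hidden_size=256,
                         num_attention_heads=4,
                         max_sequence_length=64)
actor = ChatGLMActor(config=tiny_cfg, checkpoint=True)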
@@ -0,0 +1,446 @@
"""
This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py
"""
"""Tokenization classes for ChatGLM."""
from typing import List, Optional, Union
import os

from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
from typing import Dict
import sentencepiece as spm
import numpy as np

logger = logging.get_logger(__name__)

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "THUDM/chatglm-6b": 2048,
}


class TextTokenizer:
    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)
        self.num_tokens = self.sp.vocab_size()

    def encode(self, text):
        return self.sp.EncodeAsIds(text)

    def decode(self, ids: List[int]):
        return self.sp.DecodeIds(ids)

    def tokenize(self, text):
        return self.sp.EncodeAsPieces(text)

    def convert_tokens_to_string(self, tokens):
        return self.sp.DecodePieces(tokens)

    def convert_tokens_to_ids(self, tokens):
        return [self.sp.PieceToId(token) for token in tokens]

    def convert_token_to_id(self, token):
        return self.sp.PieceToId(token)

    def convert_id_to_token(self, idx):
        return self.sp.IdToPiece(idx)

    def __len__(self):
        return self.num_tokens


class SPTokenizer:
    def __init__(
        self,
        vocab_file,
        num_image_tokens=20000,
        max_blank_length=80,
        byte_fallback=True,
    ):
        assert vocab_file is not None
        self.vocab_file = vocab_file
        self.num_image_tokens = num_image_tokens
        self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
        self.max_blank_length = max_blank_length
        self.byte_fallback = byte_fallback
        self.text_tokenizer = TextTokenizer(vocab_file)

    def _get_text_tokenizer(self):
        return self.text_tokenizer

    @staticmethod
    def get_blank_token(length: int):
        assert length >= 2
        return f"<|blank_{length}|>"

    @staticmethod
    def get_tab_token():
        return f"<|tab|>"

    @property
    def num_text_tokens(self):
        return self.text_tokenizer.num_tokens

    @property
    def num_tokens(self):
        return self.num_image_tokens + self.num_text_tokens

    @staticmethod
    def _encode_whitespaces(text: str, max_len: int = 80):
        text = text.replace("\t", SPTokenizer.get_tab_token())
        for i in range(max_len, 1, -1):
            text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
        return text

    def _preprocess(self, text: str, linebreak=True, whitespaces=True):
        if linebreak:
            text = text.replace("\n", "<n>")
        if whitespaces:
            text = self._encode_whitespaces(text, max_len=self.max_blank_length)
        return text

    def encode(
        self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
    ) -> List[int]:
        """
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (\n) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self._preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text
        tmp = self._get_text_tokenizer().encode(text)
        tokens = [x + self.num_image_tokens for x in tmp]
        return tokens if add_dummy_prefix else tokens[2:]

    def postprocess(self, text):
        text = text.replace("<n>", "\n")
        text = text.replace(SPTokenizer.get_tab_token(), "\t")
        for i in range(2, self.max_blank_length + 1):
            text = text.replace(self.get_blank_token(i), " " * i)
        return text

    def decode(self, text_ids: List[int]) -> str:
        ids = [int(_id) - self.num_image_tokens for _id in text_ids]
        ids = [_id for _id in ids if _id >= 0]
        text = self._get_text_tokenizer().decode(ids)
        text = self.postprocess(text)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self._get_text_tokenizer().convert_tokens_to_string(tokens)
        text = self.postprocess(text)
        return text

    def tokenize(
        self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True
    ) -> List[str]:
        """
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (\n) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        """
        text = self._preprocess(text, linebreak, whitespaces)
        if not add_dummy_prefix:
            text = "<n>" + text
        tokens = self._get_text_tokenizer().tokenize(text)
        return tokens if add_dummy_prefix else tokens[2:]

    def __getitem__(self, x: Union[int, str]):
        if isinstance(x, int):
            if x < self.num_image_tokens:
                return "<image_{}>".format(x)
            else:
                return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
        elif isinstance(x, str):
            if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
                return int(x[7:-1])
            else:
                return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
        else:
            raise ValueError("The key should be str or int.")


class ChatGLMTokenizer(PreTrainedTokenizer):
    """
    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = {"vocab_file": "ice_text.model"}
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        remove_space=False,
        bos_token='<sop>',
        eos_token='<eop>',
        end_token='</s>',
        mask_token='[MASK]',
        gmask_token='[gMASK]',
        padding_side="left",
        pad_token="<pad>",
        unk_token="<unk>",
        num_image_tokens=20000,
        **kwargs
    ) -> None:
        super().__init__(
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            padding_side=padding_side,
            bos_token=bos_token,
            eos_token=eos_token,
            end_token=end_token,
            mask_token=mask_token,
            gmask_token=gmask_token,
            pad_token=pad_token,
            unk_token=unk_token,
            num_image_tokens=num_image_tokens,
            **kwargs
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.vocab_file = vocab_file

        self.bos_token = bos_token
        self.eos_token = eos_token
        self.end_token = end_token
        self.mask_token = mask_token
        self.gmask_token = gmask_token

        self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)

        """ Initialisation """

    @property
    def gmask_token_id(self) -> Optional[int]:
        if self.gmask_token is None:
            return None
        return self.convert_tokens_to_ids(self.gmask_token)

    @property
    def end_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        if self.end_token is None:
            return None
        return self.convert_tokens_to_ids(self.end_token)

    @property
    def vocab_size(self):
        """ Returns vocab size """
        return self.sp_tokenizer.num_tokens

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs

        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, **kwargs):
        """ Returns a tokenized string. """
        text = self.preprocess_text(text)

        seq = self.sp_tokenizer.tokenize(text)

        return seq

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.sp_tokenizer.decode_tokens(tokens)

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        **kwargs
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if len(token_ids) == 0:
            return ""
        if self.pad_token_id in token_ids:  # remove pad
            token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
        return super()._decode(token_ids, **kwargs)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.sp_tokenizer[token]

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_tokenizer[index]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        gmask_id = self.sp_tokenizer[self.gmask_token]
        eos_id = self.sp_tokenizer[self.eos_token]
        token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1
        return token_ids_0

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        bos_token_id = self.sp_tokenizer[self.bos_token]
        mask_token_id = self.sp_tokenizer[self.mask_token]
        gmask_token_id = self.sp_tokenizer[self.gmask_token]
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if max_length is not None:
            if "attention_mask" not in encoded_inputs:
                if bos_token_id in required_input:
                    context_length = required_input.index(bos_token_id)
                else:
                    context_length = seq_length
                attention_mask = np.ones((1, seq_length, seq_length))
                attention_mask = np.tril(attention_mask)
                attention_mask[:, :, :context_length] = 1
                attention_mask = np.bool_(attention_mask < 0.5)
                encoded_inputs["attention_mask"] = attention_mask

            if "position_ids" not in encoded_inputs:
                if bos_token_id in required_input:
                    context_length = required_input.index(bos_token_id)
                else:
                    context_length = seq_length
                position_ids = np.arange(seq_length, dtype=np.int64)
                mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
                if mask_token in required_input:
                    mask_position = required_input.index(mask_token)
                    position_ids[context_length:] = mask_position
                block_position_ids = np.concatenate(
                    [np.zeros(context_length, dtype=np.int64),
                     np.arange(1, seq_length - context_length + 1, dtype=np.int64)])
                encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
                                                          pad_width=[(0, 0), (difference, 0), (difference, 0)],
                                                          mode='constant', constant_values=True)
            if "token_type_ids" in encoded_inputs:
                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
                    "token_type_ids"
                ]
            if "special_tokens_mask" in encoded_inputs:
                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
                                                        pad_width=[(0, 0), (difference, 0)])
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
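Illustrative sketch (not part of the diff): what the left-padding path in _pad adds to a single encoded example. The expected shapes in the comments follow the code above and assume the ChatGLM-6B vocabulary (ice_text.model) can be fetched via from_pretrained.

import numpy as np

from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

enc = tokenizer("Who are you?", padding="max_length", max_length=16)

print(len(enc["input_ids"]))                    # 16, left-padded with <pad>, prompt ends with [gMASK] <sop>
print(np.asarray(enc["attention_mask"]).shape)  # (1, 16, 16); True marks positions to be masked out
print(np.asarray(enc["position_ids"]).shape)    # (2, 16): [position_ids, block_position_ids]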
@@ -0,0 +1,107 @@
"""
This code is copied from https://huggingface.co/THUDM/chatglm-6b/resolve/main/configuration_chatglm.py
"""

""" ChatGLM model configuration """

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class ChatGLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`~ChatGLMModel`].
    It is used to instantiate an ChatGLM model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 150528):
            Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~ChatGLMModel`] or
            [`~TFChatGLMModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        inner_hidden_size (`int`, *optional*, defaults to 16384):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        max_sequence_length (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models).
    Example:

    ```python
    >>> from configuration_chatglm import ChatGLMConfig
    >>> from modeling_chatglm import ChatGLMModel

    >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
    >>> configuration = ChatGLMConfig()

    >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
    >>> model = ChatGLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    model_type = "chatglm"

    def __init__(
        self,
        vocab_size=130528,
        hidden_size=4096,
        num_layers=28,
        num_attention_heads=32,
        layernorm_epsilon=1e-5,
        use_cache=True,
        bos_token_id=130004,
        eos_token_id=130005,
        mask_token_id=130000,
        gmask_token_id=130001,
        pad_token_id=3,
        max_sequence_length=2048,
        inner_hidden_size=16384,
        position_encoding_2d=True,
        quantization_bit=0,
        pre_seq_len=None,
        prefix_projection=False,
        **kwargs
    ):
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.max_sequence_length = max_sequence_length
        self.layernorm_epsilon = layernorm_epsilon
        self.inner_hidden_size = inner_hidden_size
        self.use_cache = use_cache
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.mask_token_id = mask_token_id
        self.gmask_token_id = gmask_token_id
        self.position_encoding_2d = position_encoding_2d
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs
        )
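Illustrative check (not part of the diff): the updated defaults expose exactly the special-token ids the dataset code relies on; the asserted values simply restate the constructor defaults above.

from coati.models.chatglm.configuration_chatglm import ChatGLMConfig

cfg = ChatGLMConfig()
assert cfg.vocab_size == 130528
assert cfg.bos_token_id == 130004    # <sop>, used by _preprocess_chatglm to locate the context length
assert cfg.gmask_token_id == 130001  # [gMASK], appended by build_inputs_with_special_tokens
assert cfg.pad_token_id == 3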
File diff suppressed because it is too large.
@@ -52,9 +52,13 @@ class SFTTrainer(SLTrainer):
         for batch_id, batch in enumerate(self.train_dataloader):

             batch = to_device(batch, torch.cuda.current_device())
-            outputs = self.model(batch["input_ids"],
-                                 attention_mask=batch["attention_mask"],
-                                 labels=batch["labels"])
+            if "attention_mask" in batch:
+                outputs = self.model(batch["input_ids"],
+                                     attention_mask=batch["attention_mask"],
+                                     labels=batch["labels"])
+            else:
+                outputs = self.model(batch["input_ids"],
+                                     labels=batch["labels"])

             loss = outputs.loss
             loss = loss / self.accumulation_steps
@@ -9,13 +9,15 @@ from coati.models.bloom import BLOOMActor
 from coati.models.gpt import GPTActor
 from coati.models.llama import LlamaActor
 from coati.models.opt import OPTActor
+from coati.models.chatglm import ChatGLMActor
 from coati.trainer import SFTTrainer
 from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
 from datasets import load_dataset
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
+from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, AutoModel
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 from transformers.trainer import get_scheduler

@@ -58,6 +60,8 @@ def train(args):
             model = LlamaActor(pretrained=args.pretrain,
                                lora_rank=args.lora_rank,
                                checkpoint=args.grad_checkpoint)
+        elif args.model == 'chatglm':
+            model = ChatGLMActor(pretrained=args.pretrain)
         else:
             raise ValueError(f'Unsupported model "{args.model}"')

@@ -81,6 +85,9 @@ def train(args):
             "hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer)
         tokenizer.eos_token = '<\s>'
         tokenizer.pad_token = tokenizer.unk_token
+    elif args.model == 'chatglm':
+        tokenizer = ChatGLMTokenizer.from_pretrained(
+            "THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True)
     else:
         raise ValueError(f'Unsupported model "{args.model}"')

@@ -99,7 +106,6 @@ def train(args):
         optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
     else:
         optim = Adam(model.parameters(), lr=args.lr)

-    logger = get_dist_logger()

     # configure dataset
@@ -185,7 +191,7 @@ if __name__ == '__main__':
     parser.add_argument('--strategy',
                         choices=['ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_zero2_cpu'],
                         default='colossalai_zero2')
-    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
+    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama', 'chatglm'], default='bloom')
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--pretrain', type=str, default=None)
     parser.add_argument('--dataset', type=str, default=None)
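Illustrative usage (not part of the diff): with 'chatglm' wired into the choices above, an SFT run could be launched roughly as below; the torchrun invocation and anything not shown in this diff (batch size, save paths, and similar flags) are assumptions about the example script.

torchrun --standalone --nproc_per_node=1 train_sft.py \
    --model chatglm \
    --pretrain THUDM/chatglm-6b \
    --tokenizer THUDM/chatglm-6b \
    --dataset yizhongw/self_instruct \
    --strategy colossalai_zero2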
@@ -1 +1,2 @@
 pytest
+colossalai==0.3.1
@@ -2,7 +2,7 @@ transformers>=4.20.1
 tqdm
 datasets
 loralib
-colossalai>=0.2.4
+colossalai==0.3.1
 torch<2.0.0, >=1.12.1
 langchain
 tokenizers
@@ -11,7 +11,7 @@ from coati.dataset.sft_dataset import IGNORE_INDEX, SFTDataset, SupervisedDataset
 from datasets import load_dataset
 from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer, PreTrainedTokenizer
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer

+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 SFT_DATASET = [
     {
         "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
@@ -66,6 +66,8 @@ def make_tokenizer(model: str):
     elif model == "llama":
         tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
         tokenizer.pad_token = tokenizer.unk_token
+    elif model == "chatglm":
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
     else:
         raise ValueError(f"Unsupported model '{model}'")
     return tokenizer
@@ -81,13 +83,19 @@ def check_content(input_ids_stripped: torch.Tensor,
     elif model == "llama":
         assert input_ids_stripped[0] == tokenizer.bos_token_id
         input_ids_stripped = input_ids_stripped[1:]
-
+    elif model == "chatglm":
+        assert input_ids_stripped[0] == tokenizer.bos_token_id
+        assert input_ids_stripped[-1] == tokenizer.eos_token_id
+        input_ids_stripped = input_ids_stripped[1:-1]
     assert torch.all(input_ids_stripped != tokenizer.pad_token_id)
     assert torch.all(input_ids_stripped != tokenizer.bos_token_id)
     assert torch.all(input_ids_stripped != tokenizer.eos_token_id)
     assert input_ids_stripped != tokenizer.sep_token_id
     assert input_ids_stripped != tokenizer.cls_token_id
-    assert input_ids_stripped != tokenizer.mask_token_id
+    if model == "chatglm":
+        assert torch.all(input_ids_stripped != tokenizer.mask_token_id)
+    else:
+        assert input_ids_stripped != tokenizer.mask_token_id


 @pytest.mark.cpu
@@ -189,7 +197,7 @@ def test_reward_dataset(model: str,


 @pytest.mark.cpu
-@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
+@pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama", "chatglm"])
 @pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
 @pytest.mark.parametrize("max_dataset_size", [2])
 @pytest.mark.parametrize("max_length", [32, 1024])
@@ -213,6 +221,19 @@ def test_sft_dataset(model: str,
                              max_length=max_length)
     assert len(sft_dataset) == min(max_dataset_size, len(SFT_DATASET))

+    if isinstance(tokenizer, ChatGLMTokenizer):
+        for i in range(max_dataset_size):
+            assert isinstance(sft_dataset[i], dict)
+            assert list(sft_dataset[i].keys()) == ["input_ids", "labels"]
+            input_ids = sft_dataset[i]["input_ids"]
+            labels = sft_dataset[i]["labels"]
+            assert input_ids.shape == labels.shape == torch.Size([max_length])
+
+            ignore_mask = labels == IGNORE_INDEX
+            assert input_ids.masked_select(torch.logical_not(ignore_mask))[0] == tokenizer.bos_token_id
+            check_content(input_ids.masked_select(torch.logical_not(ignore_mask)), tokenizer, model)
+        return
+
     for i in range(max_dataset_size):
         assert isinstance(sft_dataset[i], dict)
         assert list(sft_dataset[i].keys()) == ["input_ids", "labels", "attention_mask"]
@@ -9,11 +9,12 @@ from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
 from coati.models.generation import generate
 from coati.models.gpt import GPTRM, GPTActor, GPTCritic
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
+from coati.models.chatglm import ChatGLMActor
 from coati.models.lora import LoraLinear, convert_to_lora_module
 from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.models.utils import calc_action_log_probs, compute_reward, masked_mean

+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer

 @pytest.mark.gpu
 @pytest.mark.parametrize("batch_size", [4])
@@ -23,7 +24,8 @@ from coati.models.utils import calc_action_log_probs, compute_reward, masked_mean
     lambda: GPTActor(),
     # HACK: skip llama due to long execution time
     # lambda: LlamaActor(),
-    lambda: OPTActor()
+    lambda: OPTActor(),
+    # lambda: ChatGLMActor(),
 ])
 @pytest.mark.parametrize("generate_kwargs", [{
     "max_length": 64,
@@ -129,12 +131,12 @@ def test_lora(lora_rank: int,
     # HACK: skip llama due to long execution time
     # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
     lambda: (OPTActor(), OPTCritic(), OPTRM()),
+    lambda: (ChatGLMActor(), None, None),
 ])
 @torch.no_grad()
 def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
                 batch_size: int,
                 seq_len: int):

     actor_input = {
         "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
         "attention_mask": torch.randint(0, 2, (batch_size, seq_len))
@@ -150,20 +152,30 @@ def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
     }

     actor, critic, rm = models_maker()
+    if isinstance(actor, ChatGLMActor):
+        actor = actor.float()
+        tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+        chatglm_special_token = torch.tensor([tokenizer.gmask_token_id, tokenizer.bos_token_id]).repeat(batch_size, 1)
+        actor_input = {
+            "input_ids": torch.cat((torch.randint(0, 100, (batch_size, seq_len // 2)), chatglm_special_token, torch.randint(0, 100, (batch_size, seq_len // 2 - 2))), dim=1),
+            "attention_mask": torch.randint(0, 2, (batch_size, 1, seq_len, seq_len))
+        }
     assert isinstance(actor, Actor)
     base_actor_model = get_base_model(actor)
-    assert isinstance(critic, Critic)
-    base_critic_model = get_base_model(critic)
-    assert isinstance(rm, RewardModel)
-    base_rm_model = get_base_model(rm)

     actor_output = actor(**actor_input)
-    critic_output = critic(**critic_input)
-    rm_output = rm(**rm_input)

     assert actor_output.logits.shape[:2] == (batch_size, seq_len)
-    assert critic_output.shape == (batch_size, )
-    assert rm_output.shape == (batch_size, )

+    if critic:
+        assert isinstance(critic, Critic)
+        base_critic_model = get_base_model(critic)
+        critic_output = critic(**critic_input)
+        assert critic_output.shape == (batch_size, )
+
+    if rm:
+        assert isinstance(rm, RewardModel)
+        base_rm_model = get_base_model(rm)
+        rm_output = rm(**rm_input)
+        assert rm_output.shape == (batch_size, )


 @pytest.mark.cpu