# ColossalAI/tests/kit/model_zoo/transformers/mistral.py

import torch
import transformers
from transformers import MistralConfig

from ..registry import ModelAttribute, model_zoo

# ===============================
# Register single-sentence Mistral
# ===============================


def data_gen():
    """Return a fixed single-sentence batch of token ids and attention mask.

    The ids are hard-coded; they were generated from the following snippet:

        from transformers import AutoModelForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
        input = 'My favourite condiment is vinegar' (last two words repeated to satisfy length requirement)
        tokenized_input = tokenizer([input], return_tensors="pt")
        input_ids = tokenized_input['input_ids']
        attention_mask = tokenized_input['attention_mask']

    Returns:
        A dict with ``input_ids`` and ``attention_mask``, both int64 tensors
        holding one sequence of eight positions.
    """
    token_ids = torch.tensor([[1, 1984, 16020, 2076, 2487, 349, 21375, 4749]], dtype=torch.int64)
    # Every position is a real token, so the mask is all ones (same shape and dtype as the ids).
    mask = torch.ones_like(token_ids)
    return {"input_ids": token_ids, "attention_mask": mask}


def data_gen_for_lm():
    """Return the base batch from ``data_gen`` extended with LM ``labels``.

    The sample has no padding, so the labels are a copy of ``input_ids``.
    """
    batch = data_gen()
    # Clone so that the labels tensor does not alias the input tensor.
    batch.update(labels=batch["input_ids"].clone())
    return batch


def data_gen_for_sequence_classification():
    """Return the base batch from ``data_gen`` extended with one class label.

    ``labels`` is a one-element int64 tensor holding class id 1.
    """
    batch = data_gen()
    class_label = torch.tensor([1], dtype=torch.int64)
    batch["labels"] = class_label
    return batch


# define output transform function
def output_transform_fn(x):
    """Return the model output unchanged."""
    return x


# define loss function
def loss_fn_for_mistral_model(x):
    """Return the MSE between ``x.last_hidden_state`` and an all-ones tensor of the same shape."""
    hidden = x.last_hidden_state
    target = torch.ones_like(hidden)
    return torch.nn.functional.mse_loss(hidden, target)


def loss_fn(x):
    """Return the loss stored on the output as ``x.loss``."""
    return x.loss


def loss_fn_for_seq_classification(output):
    """Return the mean of ``output.logits``."""
    return output.logits.mean()

# Small Mistral configuration (2 layers, hidden size 256) shared by every model registered below.
config = MistralConfig(
    vocab_size=50258,
    hidden_size=256,
    intermediate_size=256,
    num_hidden_layers=2,
    num_attention_heads=64,
)

# When the config exposes a pad token id, reuse the EOS token id for it.
# NOTE(review): presumably needed by the sequence-classification head - confirm.
if hasattr(config, "pad_token_id"):
    config.pad_token_id = config.eos_token_id

def _register_mistral(name, model_fn, data_gen_fn, loss_fn):
    """Register one Mistral variant in the model zoo.

    All variants share the identity output transform and are flagged as
    having control flow; a fresh ``ModelAttribute`` is built for each call.
    """
    model_zoo.register(
        name=name,
        model_fn=model_fn,
        data_gen_fn=data_gen_fn,
        output_transform_fn=output_transform_fn,
        loss_fn=loss_fn,
        model_attribute=ModelAttribute(has_control_flow=True),
    )


# The model factories stay as lambdas so the models are only built when a test asks for them.
_register_mistral(
    "transformers_mistral",
    lambda: transformers.MistralModel(config),
    data_gen,
    loss_fn_for_mistral_model,
)
_register_mistral(
    "transformers_mistral_for_causal_lm",
    lambda: transformers.MistralForCausalLM(config),
    data_gen_for_lm,
    loss_fn,
)
_register_mistral(
    "transformers_mistral_for_sequence_classification",
    lambda: transformers.MistralForSequenceClassification(config),
    data_gen_for_sequence_classification,
    loss_fn_for_seq_classification,
)
[shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088) * [shardformer] implement policy for all GPT-J models and test * [shardformer] support interleaved pipeline parallel for bert finetune * [shardformer] shardformer support falcon (#4883) * [shardformer]: fix interleaved pipeline for bert model (#5048) * [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093) * Add Mistral support for Shardformer (#5103) * [shardformer] add tests to mistral (#5105) --------- Co-authored-by: Pengtai Xu <henryxu880@gmail.com> Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com> Co-authored-by: flybird11111 <1829166702@qq.com> Co-authored-by: eric8607242 <e0928021388@gmail.com> 2023-11-28 08:54:42 +00:00			`import torch`
			`import transformers`
			`from transformers import MistralConfig`

			`from ..registry import ModelAttribute, model_zoo`

			`# ===============================`
			`# Register single-sentence Mistral`
			`# ===============================`


			`def data_gen():`
			`# Generated from following code snippet`
			`#`
			`# from transformers import AutoModelForCausalLM, AutoTokenizer`
			`# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")`
			`# input = 'My favourite condiment is vinegar' (last two words repeated to satisfy length requirement)`
			`# tokenized_input = tokenizer([input], return_tensors="pt")`
			`# input_ids = tokenized_input['input_ids']`
			`# attention_mask = tokenized_input['attention_mask']`
			`input_ids = torch.tensor([[1, 1984, 16020, 2076, 2487, 349, 21375, 4749]], dtype=torch.int64)`
			`attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)`
			`return dict(input_ids=input_ids, attention_mask=attention_mask)`


			`def data_gen_for_lm():`
			`# LM data gen`
			# the `labels` of LM is the token of the output, cause no padding, use `input_ids` as `labels`
			`data = data_gen()`
			`data["labels"] = data["input_ids"].clone()`
			`return data`


			`def data_gen_for_sequence_classification():`
			`# sequence classification data gen`
			`data = data_gen()`
			`data["labels"] = torch.tensor([1], dtype=torch.int64)`
			`return data`


			`# define output transform function`
			`output_transform_fn = lambda x: x`

			`# define loss function`
			`loss_fn_for_mistral_model = lambda x: torch.nn.functional.mse_loss(`
			`x.last_hidden_state, torch.ones_like(x.last_hidden_state)`
			`)`
			`loss_fn = lambda x: x.loss`
			`loss_fn_for_seq_classification = lambda output: output.logits.mean()`

			`config = MistralConfig(`
			`hidden_size=256, intermediate_size=256, num_attention_heads=64, num_hidden_layers=2, vocab_size=50258`
			`)`

[shardformer] update transformers (#5583) * flash_attention forward upgrade * llama_model_forward * remove useless comment * update the requirements.txt * add the transformers version requirements * remove the LATEST VERSION try * [shardformer] update bloom model (#5518) * update bloom model * remove the version restriction * [shardformer] update_falcon (#5520) * [shardformer] update mistral model (#5511) * [shardformer] update gpt2 (#5502) * [shardformer] update gptj model (#5503) * [shardformer] update opt (#5522) * [shardformer] update t5 model (#5524) * [shardformer] update whisper model (#5529) * [shardformer] update vit model (#5530) * update vit model * remove the output_hidden_states * [shardformer] fix llama modeling * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [zero] support multiple (partial) backward passes (#5596) * [zero] support multiple (partial) backward passes * [misc] update requirements * [zero] support multiple (partial) backward passes (#5596) * [zero] support multiple (partial) backward passes * [misc] update requirements * fix conflicts * [doc] fix ColossalMoE readme (#5599) * fix readme * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * merge with main * merge with main * llama_model_forward * remove useless comment * remove the LATEST VERSION try * [shardformer] update bloom model (#5518) * update bloom model * remove the version restriction * [shardformer] update mistral model (#5511) * [shardformer] update opt (#5522) * [shardformer] update whisper model (#5529) * [shardformer] fix llama modeling * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [hotfix] Fix examples no 
pad token & auto parallel codegen bug; (#5606) * fix no pad token bug * fixed some auto parallel codegen bug, but might not run on torch 2.1 --------- Co-authored-by: Edenzzzz <wtan45@wisc.edu> * [shardformer] fix pipeline grad ckpt (#5620) * [shardformer] fix pipeline grad ckpt * [shardformer] fix whisper (#5628) * [test] fix llama model test * fix the opt upgrade (#5634) * [shardformer] fix attn replacement (#5636) * [shardformer] update flashattention replacement (#5637) * update transformers update transformers fix fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [test] fix llama test (#5638) * [gemini] fix buffer cast (#5639) * Fix shardformer upgrade (#5640) * fix llama model * fix the mistral * fix the shardformer model * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [shardformer]support pipeline parallelism for mistral. 
(#5642) * [shardformer] fix attn replacement (#5636) * [shardformer] update flashattention replacement (#5637) * update transformers update transformers fix fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Feature] Support LLaMA-3 CPT and ST (#5619) * support LLaMA-3 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Run pre-commit --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [exampe] update llama example (#5626) * [plugin] support dp inside for hybriad parallel * [example] update llama benchmark * [example] update llama benchmark * [example] update llama readme * [example] update llama readme * [example] llama3 (#5631) * release llama3 * [release] llama3 * [release] llama3 * [release] llama3 * [release] llama3 * [test] fix llama test (#5638) * [gemini] fix buffer cast (#5639) * support pp for mistral * fix * fix fix fix * fix --------- Co-authored-by: Hongxin Liu <lhx0217@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> --------- Co-authored-by: Hongxin Liu <lhx0217@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu> Co-authored-by: Edenzzzz <wtan45@wisc.edu> Co-authored-by: flybird11111 <1829166702@qq.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> 2024-04-24 14:51:50 +00:00			`if hasattr(config, "pad_token_id"):`
			`config.pad_token_id = config.eos_token_id`

[shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088) * [shardformer] implement policy for all GPT-J models and test * [shardformer] support interleaved pipeline parallel for bert finetune * [shardformer] shardformer support falcon (#4883) * [shardformer]: fix interleaved pipeline for bert model (#5048) * [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093) * Add Mistral support for Shardformer (#5103) * [shardformer] add tests to mistral (#5105) --------- Co-authored-by: Pengtai Xu <henryxu880@gmail.com> Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com> Co-authored-by: flybird11111 <1829166702@qq.com> Co-authored-by: eric8607242 <e0928021388@gmail.com> 2023-11-28 08:54:42 +00:00			`model_zoo.register(`
			`name="transformers_mistral",`
			`model_fn=lambda: transformers.MistralModel(config),`
			`data_gen_fn=data_gen,`
			`output_transform_fn=output_transform_fn,`
			`loss_fn=loss_fn_for_mistral_model,`
			`model_attribute=ModelAttribute(has_control_flow=True),`
			`)`
			`model_zoo.register(`
[Feature] Zigzag Ring attention (#5905) * halfway * fix cross-PP-stage position id length diff bug * fix typo * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unified cross entropy func for all shardformer models * remove redundant lines * add basic ring attn; debug cross entropy * fwd bwd logic complete * fwd bwd logic complete; add experimental triton rescale * precision tests passed * precision tests passed * fix typos and remove misc files * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add sp_mode to benchmark; fix varlen interface * update softmax_lse shape by new interface * change tester name * remove buffer clone; support packed seq layout * add varlen tests * fix typo * all tests passed * add dkv_group; fix mask * remove debug statements --------- Co-authored-by: Edenzzzz <wtan45@wisc.edu> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2024-08-16 05:56:38 +00:00			`name="transformers_mistral_for_causal_lm",`
[shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088) * [shardformer] implement policy for all GPT-J models and test * [shardformer] support interleaved pipeline parallel for bert finetune * [shardformer] shardformer support falcon (#4883) * [shardformer]: fix interleaved pipeline for bert model (#5048) * [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093) * Add Mistral support for Shardformer (#5103) * [shardformer] add tests to mistral (#5105) --------- Co-authored-by: Pengtai Xu <henryxu880@gmail.com> Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com> Co-authored-by: flybird11111 <1829166702@qq.com> Co-authored-by: eric8607242 <e0928021388@gmail.com> 2023-11-28 08:54:42 +00:00			`model_fn=lambda: transformers.MistralForCausalLM(config),`
			`data_gen_fn=data_gen_for_lm,`
			`output_transform_fn=output_transform_fn,`
			`loss_fn=loss_fn,`
			`model_attribute=ModelAttribute(has_control_flow=True),`
			`)`
			`model_zoo.register(`
			`name="transformers_mistral_for_sequence_classification",`
			`model_fn=lambda: transformers.MistralForSequenceClassification(config),`
			`data_gen_fn=data_gen_for_sequence_classification,`
			`output_transform_fn=output_transform_fn,`
			`loss_fn=loss_fn_for_seq_classification,`
			`model_attribute=ModelAttribute(has_control_flow=True),`
			`)`