import argparse

import torch
import torch.distributed as dist
from transformers import AutoTokenizer
from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator

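# Example launch command (assumptions: this script is saved as infer.py and two GPUs
# are available; adjust --nproc_per_node and the arguments to your setup):
#   torchrun --standalone --nproc_per_node=2 infer.py --model_name mistralai/Mixtral-8x7B-v0.1 --precision bf16

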
def parse_args():
    # basic settings
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        default="mistralai/Mixtral-8x7B-v0.1",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--plugin",
        type=str,
        default="ep",
        choices=["ep"],
        help="Parallel method.",
    )
    parser.add_argument(
        "--precision",
        type=str,
        default="bf16",
        choices=["fp32", "bf16", "fp16"],
        help="Mixed precision mode.",
    )
    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible generation.")

    # kernel
    parser.add_argument(
        "--use_kernel",
        action="store_true",
        help="Use kernel optimizations. Requires flash attention and triton to enable all of them; skipped if not installed.",
    )
    parser.add_argument(
        "--use_layernorm_kernel",
        action="store_true",
        help="Use the fused layernorm kernel. Requires apex; raises an error if it is not installed.",
    )
    args = parser.parse_args()
    return args


def main():
    args = parse_args()

    # Launch ColossalAI
    colossalai.launch_from_torch(seed=args.seed)
    coordinator = DistCoordinator()

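    # The expert-parallel group cannot be larger than the number of experts per layer
    # (num_local_experts is 8 for Mixtral-8x7B), so cap ep_size at that value.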
    config = MixtralConfig.from_pretrained(args.model_name)
    ep_size = min(dist.get_world_size(), config.num_local_experts)

    # Set plugin
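    # tp_size=1 and pp_size=1 disable tensor and pipeline parallelism, so the model is
    # partitioned only along the expert dimension (expert parallel size ep_size).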
    if args.plugin == "ep":
        plugin = MoeHybridParallelPlugin(
            tp_size=1,
            pp_size=1,
            ep_size=ep_size,
            zero_stage=1,
            precision=args.precision,
            enable_fused_normalization=args.use_layernorm_kernel,
            enable_jit_fused=args.use_kernel,
        )
    else:
        raise ValueError(f"Invalid plugin {args.plugin}")
    coordinator.print_on_master(f"Set plugin as {plugin.__class__.__name__}")

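    # NOTE: every rank first loads the full checkpoint here; the booster below then
    # distributes the experts across the expert-parallel group.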
    # Build mixtral model
    model = MixtralForCausalLM.from_pretrained(args.model_name)
    coordinator.print_on_master("Finished loading model")

    # Prepare tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Set booster
    booster = Booster(plugin=plugin)
    model, _, _, _, _ = booster.boost(model=model)
    coordinator.print_on_master("Finished initializing booster")

    model.eval()

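    # Each rank generates from its own prompts: rank 0 uses a single English prompt,
    # the remaining ranks share a small multilingual batch.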
    if coordinator.rank == 0:
        text = ["Hello my name is"]
    else:
        text = [
            "What's the largest country in the world?",
            "How many people live in China?",
            "帮我续写这首诗:离离原上草",  # "Help me continue this poem: lush grass spreads over the ancient plain"
        ]
    tokenizer.pad_token = tokenizer.unk_token
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch.cuda.current_device())

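    # booster.boost wraps the model, so generation goes through model.module.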
    with torch.no_grad():
        outputs = model.module.generate(**inputs, max_new_tokens=20)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"[{coordinator.rank}] {outputs}")


if __name__ == "__main__":
    main()