import argparse

import torch
import torch.distributed as dist
from transformers import AutoTokenizer
from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator


def parse_args():
    # basic settings
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        default="mistralai/Mixtral-8x7B-v0.1",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--plugin",
        type=str,
        default="ep",
        choices=["ep"],
        help="Parallel method.",
    )
    parser.add_argument(
        "--precision",
        type=str,
        default="bf16",
        choices=["fp32", "bf16", "fp16"],
        help="Mixed precision mode.",
    )
    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible runs.")
    # kernel
    parser.add_argument(
        "--use_kernel",
        action="store_true",
        help="Use kernel optimizations. Requires flash attention and triton to enable all kernel optimizations. Skipped if not installed.",
    )
    parser.add_argument(
        "--use_layernorm_kernel",
        action="store_true",
        help="Use layernorm kernel. Requires apex. Raises an error if not installed.",
    )
    args = parser.parse_args()
    return args


def main():
    args = parse_args()

    # Launch ColossalAI
    colossalai.launch_from_torch(seed=args.seed)
    coordinator = DistCoordinator()

    # Expert parallel size cannot exceed either the world size or the number of experts
    config = MixtralConfig.from_pretrained(args.model_name)
    ep_size = min(dist.get_world_size(), config.num_local_experts)

    # Set plugin
    if args.plugin == "ep":
        plugin = MoeHybridParallelPlugin(
            tp_size=1,
            pp_size=1,
            ep_size=ep_size,
            zero_stage=1,
            precision=args.precision,
            enable_fused_normalization=args.use_layernorm_kernel,
            enable_jit_fused=args.use_kernel,
        )
    else:
        raise ValueError(f"Invalid plugin {args.plugin}")
    coordinator.print_on_master(f"Set plugin as {plugin.__class__.__name__}")

    # Build Mixtral model
    model = MixtralForCausalLM.from_pretrained(args.model_name)
    coordinator.print_on_master("Finished loading model")

    # Prepare tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Set booster; boosting wraps the model for expert-parallel execution
    booster = Booster(plugin=plugin)
    model, _, _, _, _ = booster.boost(model=model)
    coordinator.print_on_master("Finished initializing booster")

    model.eval()
    # Each rank generates from its own prompts
    if coordinator.rank == 0:
        text = ["Hello my name is"]
    else:
        text = [
            "What's the largest country in the world?",
            "How many people live in China?",
            "帮我续写这首诗:离离原上草",  # Chinese prompt: "Help me continue this poem: ..."
        ]
    # Mixtral's tokenizer has no pad token by default; reuse the unk token for padding
    tokenizer.pad_token = tokenizer.unk_token
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch.cuda.current_device())

    with torch.no_grad():
        # The boosted model is wrapped, so generate() is reached via .module
        outputs = model.module.generate(**inputs, max_new_tokens=20)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"[{coordinator.rank}] {outputs}")


if __name__ == "__main__":
    main()
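# Example launch command. This is a sketch: the filename (inference.py) and the
# GPU count (4) are assumptions, not fixed by this script. Since the script
# calls colossalai.launch_from_torch, it must be started through a
# torchrun-style launcher so that each GPU gets its own process:
#
#   torchrun --nproc_per_node=4 inference.py \
#       --model_name mistralai/Mixtral-8x7B-v0.1 --plugin ep --precision bf16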