# ColossalAI/examples/language/grok-1/inference_tp.py

import time

import torch
from grok1_policy import Grok1ForCausalLMPolicy
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import get_defualt_parser, inference, print_output

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
from colossalai.utils import get_current_device

if __name__ == "__main__":
    # Script entry point: set up the distributed runtime, load the pretrained
    # causal LM under a HybridParallelPlugin, generate a completion for every
    # prompt in ``--text``, and print timing statistics on the master rank.
    cli = get_defualt_parser().parse_args()

    # The clock starts after argument parsing, so the reported initialization
    # time covers launch, plugin/booster creation, tokenizer + model loading,
    # and boosting.
    t_begin = time.time()
    colossalai.launch_from_torch({})
    coordinator = DistCoordinator()

    # tp_size is the full world size and pp_size is 1, i.e. every rank is
    # assigned to the tensor-parallel dimension.
    parallel_plugin = HybridParallelPlugin(
        tp_size=coordinator.world_size,
        pp_size=1,
        precision="bf16",
        parallel_output=False,
        custom_policy=Grok1ForCausalLMPolicy(),
    )
    booster = Booster(plugin=parallel_plugin)
    torch.set_default_dtype(torch.bfloat16)

    # Tokenizer and model are both resolved from the same --pretrained path.
    tokenizer = AutoTokenizer.from_pretrained(cli.pretrained, trust_remote_code=True)

    hf_load_kwargs = {"trust_remote_code": True, "torch_dtype": torch.bfloat16}
    with LazyInitContext(default_device=get_current_device()):
        model = AutoModelForCausalLM.from_pretrained(cli.pretrained, **hf_load_kwargs)
    # boost() returns several objects; only the wrapped model is needed here.
    model, *_unused = booster.boost(model)
    model.eval()
    setup_seconds = time.time() - t_begin

    # Generation settings come straight from the command line and do not
    # change between prompts.
    sampling_options = {
        "max_new_tokens": cli.max_new_tokens,
        "do_sample": cli.do_sample,
        "temperature": cli.temperature,
        "top_k": cli.top_k,
        "top_p": cli.top_p,
    }
    for prompt in cli.text:
        # Every rank runs inference; only the master rank prints the result.
        generated = inference(model.unwrap(), tokenizer, prompt, **sampling_options)
        if not coordinator.is_master():
            continue
        print_output(prompt, tokenizer.decode(generated))

    total_seconds = time.time() - t_begin
    generation_seconds = total_seconds - setup_seconds
    per_prompt_seconds = generation_seconds / len(cli.text)
    timing_report = (
        f"Initializing time: {setup_seconds:.2f} seconds.\n"
        f"Overall time: {total_seconds:.2f} seconds. \n"
        f"Generation latency: {generation_seconds:.2f} seconds. \n"
        f"Average generation latency: {per_prompt_seconds:.2f} seconds. \n"
    )
    coordinator.print_on_master(timing_report)
[example] add grok-1 inference (#5485) * [misc] add submodule * remove submodule * [example] support grok-1 tp inference * [example] add grok-1 inference script * [example] refactor code * [example] add grok-1 readme * [exmaple] add test ci * [exmaple] update readme 8 months ago			`import time`

			`import torch`
			`from grok1_policy import Grok1ForCausalLMPolicy`
[Fix] Grok-1 use tokenizer from the same pretrained path (#5532) * [fix] use tokenizer from the same pretrained path * trust remote code 8 months ago			`from transformers import AutoModelForCausalLM, AutoTokenizer`
[example] add grok-1 inference (#5485) * [misc] add submodule * remove submodule * [example] support grok-1 tp inference * [example] add grok-1 inference script * [example] refactor code * [example] add grok-1 readme * [exmaple] add test ci * [exmaple] update readme 8 months ago			`from utils import get_defualt_parser, inference, print_output`

			`import colossalai`
			`from colossalai.booster import Booster`
			`from colossalai.booster.plugin import HybridParallelPlugin`
			`from colossalai.cluster import DistCoordinator`
			`from colossalai.lazy import LazyInitContext`
			`from colossalai.utils import get_current_device`

			`if __name__ == "__main__":`
			`parser = get_defualt_parser()`
			`args = parser.parse_args()`
			`start = time.time()`
			`colossalai.launch_from_torch({})`
			`coordinator = DistCoordinator()`
			`plugin = HybridParallelPlugin(`
			`tp_size=coordinator.world_size,`
			`pp_size=1,`
			`precision="bf16",`
			`parallel_output=False,`
			`custom_policy=Grok1ForCausalLMPolicy(),`
			`)`
			`booster = Booster(plugin=plugin)`
			`torch.set_default_dtype(torch.bfloat16)`
[Fix] Grok-1 use tokenizer from the same pretrained path (#5532) * [fix] use tokenizer from the same pretrained path * trust remote code 8 months ago
			`tokenizer = AutoTokenizer.from_pretrained(args.pretrained, trust_remote_code=True)`

[example] add grok-1 inference (#5485) * [misc] add submodule * remove submodule * [example] support grok-1 tp inference * [example] add grok-1 inference script * [example] refactor code * [example] add grok-1 readme * [exmaple] add test ci * [exmaple] update readme 8 months ago			`with LazyInitContext(default_device=get_current_device()):`
			`model = AutoModelForCausalLM.from_pretrained(`
			`args.pretrained, trust_remote_code=True, torch_dtype=torch.bfloat16`
			`)`
			`model, *_ = booster.boost(model)`
[example] update Grok-1 inference (#5495) * revise grok-1 example * remove unused arg in scripts * prevent re-installing torch * update readme * revert modifying colossalai requirements * add perf * trivial * add tokenizer url 8 months ago			`model.eval()`
			`init_time = time.time() - start`

[example] add grok-1 inference (#5485) * [misc] add submodule * remove submodule * [example] support grok-1 tp inference * [example] add grok-1 inference script * [example] refactor code * [example] add grok-1 readme * [exmaple] add test ci * [exmaple] update readme 8 months ago			`for text in args.text:`
			`output = inference(`
			`model.unwrap(),`
[example] update Grok-1 inference (#5495) * revise grok-1 example * remove unused arg in scripts * prevent re-installing torch * update readme * revert modifying colossalai requirements * add perf * trivial * add tokenizer url 8 months ago			`tokenizer,`
[example] add grok-1 inference (#5485) * [misc] add submodule * remove submodule * [example] support grok-1 tp inference * [example] add grok-1 inference script * [example] refactor code * [example] add grok-1 readme * [exmaple] add test ci * [exmaple] update readme 8 months ago			`text,`
			`max_new_tokens=args.max_new_tokens,`
			`do_sample=args.do_sample,`
			`temperature=args.temperature,`
			`top_k=args.top_k,`
			`top_p=args.top_p,`
			`)`
			`if coordinator.is_master():`
[example] update Grok-1 inference (#5495) * revise grok-1 example * remove unused arg in scripts * prevent re-installing torch * update readme * revert modifying colossalai requirements * add perf * trivial * add tokenizer url 8 months ago			`print_output(text, tokenizer.decode(output))`

			`overall_time = time.time() - start`
			`gen_latency = overall_time - init_time`
			`avg_gen_latency = gen_latency / len(args.text)`
			`coordinator.print_on_master(`
			`f"Initializing time: {init_time:.2f} seconds.\n"`
			`f"Overall time: {overall_time:.2f} seconds. \n"`
			`f"Generation latency: {gen_latency:.2f} seconds. \n"`
			`f"Average generation latency: {avg_gen_latency:.2f} seconds. \n"`
			`)`