from time import time
from typing import Optional

import torch
import torch.distributed as dist
from torch import Tensor

from colossalai.accelerator import get_accelerator
from colossalai.cluster import DistCoordinator


def divide(x: float, y: float) -> float:
    if y == 0:
        return float("inf")
    elif y == float("inf"):
        return float("nan")
    return x / y


@torch.no_grad()
def all_reduce_mean(x: float, world_size: int) -> float:
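    # Collective call: every rank in the default process group must reach this point.
    # `dist.all_reduce` defaults to a SUM, so dividing by `world_size` yields the mean.
    # Usage sketch (hypothetical 4-rank job):
    #     avg_step_time = all_reduce_mean(local_step_time, world_size=4)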
    if world_size == 1:
        return x
    tensor = torch.tensor([x], device=get_accelerator().get_current_device())
    dist.all_reduce(tensor)
    tensor = tensor / world_size
    return tensor.item()


class Timer:
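    """Cumulative wall-clock timer: each `start()`/`end()` pair adds to `duration`."""
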
    def __init__(self) -> None:
        self.start_time: Optional[float] = None
        self.duration: float = 0.0

    def start(self) -> None:
        self.start_time = time()

    def end(self) -> None:
        assert self.start_time is not None
        self.duration += time() - self.start_time
        self.start_time = None

    def reset(self) -> None:
        self.duration = 0.0


class PerformanceEvaluator:
    """
    Callback for evaluating the performance of the model.
    Args:
        model_numel: The number of parameters of the model.
        num_layers: The number of transformer layers of the model.
        hidden_size: The hidden size of the model.
        vocab_size: The vocabulary size of the model.
        enable_grad_checkpoint: Whether gradient checkpointing (activation recomputation) is enabled.
        ignore_steps: The number of warm-up steps to exclude from the measurement.
        dp_world_size: The data-parallel world size. Defaults to the global world size.
    """

    def __init__(
        self,
        model_numel: int,
        num_layers: int,
        hidden_size: int,
        vocab_size: int,
        enable_grad_checkpoint: bool = False,
        ignore_steps: int = 0,
        dp_world_size: Optional[int] = None,
    ) -> None:
        self.model_numel = model_numel
        self.enable_grad_checkpoint = enable_grad_checkpoint
        self.ignore_steps = ignore_steps
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.coordinator = DistCoordinator()
        self.dp_world_size = dp_world_size or self.coordinator.world_size
        self.disable: bool = False
        self.timer = Timer()
        self.num_samples: int = 0
        self.flop_megatron = 0
        self.flop: int = 0

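    # Steps below `ignore_steps` are treated as warm-up: timing and FLOP counting
    # are skipped for them so start-up overhead does not skew the measured throughput.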
    def on_step_start(self, step: int) -> None:
        self.disable = self.ignore_steps > 0 and step < self.ignore_steps
        if self.disable:
            return
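        # Synchronize the device before starting the timer so that previously
        # queued kernels are not attributed to this step's wall-clock time.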
        get_accelerator().synchronize()
        self.timer.start()

    def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
        if self.disable:
            return
        get_accelerator().synchronize()
        self.timer.end()

        batch_size, seq_len = input_ids.shape

        self.num_samples += batch_size
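        # Megatron-style FLOP estimate: the expression below follows the per-iteration
        # count popularized by Megatron-LM, 24 * B * s * l * h^2 * (1 + s/(6h) + V/(16*l*h))
        # per forward pass, scaled by 3 (one forward + two backward) or 4 when activation
        # checkpointing adds an extra forward recomputation.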
        checkpoint_activations_factor = 3 + int(self.enable_grad_checkpoint)
        self.flop_megatron += (
            24 * checkpoint_activations_factor * batch_size * seq_len * self.num_layers * (self.hidden_size**2)
        ) * (
            1.0 + (seq_len / (6.0 * self.hidden_size)) + (self.vocab_size / (16.0 * self.num_layers * self.hidden_size))
        )
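        # Simpler dense estimate: roughly 2 * num_parameters FLOPs per token for the
        # forward pass, times 3 (forward + backward) or 4 with activation checkpointing.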
        self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))

    def on_fit_end(self) -> None:
        avg_duration = all_reduce_mean(self.timer.duration, self.coordinator.world_size)
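        # `num_samples` only counts this rank's data-parallel shard, so multiply by
        # the data-parallel world size to report global samples per second.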
        avg_throughput = self.num_samples * self.dp_world_size / (avg_duration + 1e-12)
        mp_world_size = self.coordinator.world_size // self.dp_world_size
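        # Per-GPU TFLOPS: `flop` counts the work of a single data-parallel replica,
        # which is shared across `mp_world_size` model-parallel ranks, hence the division.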
        avg_tflops_per_gpu_megatron = self.flop_megatron / 1e12 / (avg_duration + 1e-12) / mp_world_size
        avg_tflops_per_gpu = self.flop / 1e12 / (avg_duration + 1e-12) / mp_world_size
        self.coordinator.print_on_master(
            f"num_samples: {self.num_samples}, dp_world_size: {self.dp_world_size}, flop_megatron: {self.flop_megatron}, flop: {self.flop}, avg_duration: {avg_duration}, "
            f"avg_throughput: {avg_throughput}"
        )
        self.coordinator.print_on_master(
            f"Throughput: {avg_throughput:.2f} samples/sec, TFLOPS per GPU by Megatron: {avg_tflops_per_gpu_megatron:.2f}, TFLOPS per GPU: {avg_tflops_per_gpu:.2f}"
        )
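

# Usage sketch (not part of this module): a hypothetical training loop that drives
# the evaluator. The `model`, `config`, `dataloader`, `criterion`, and `optimizer`
# objects below are assumptions for illustration only.
#
#     evaluator = PerformanceEvaluator(
#         model_numel=sum(p.numel() for p in model.parameters()),
#         num_layers=config.num_hidden_layers,
#         hidden_size=config.hidden_size,
#         vocab_size=config.vocab_size,
#         enable_grad_checkpoint=True,
#         ignore_steps=2,
#     )
#     for step, batch in enumerate(dataloader):
#         evaluator.on_step_start(step)
#         loss = criterion(model(**batch))
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         evaluator.on_step_end(batch["input_ids"])
#     evaluator.on_fit_end()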