# ColossalAI/applications/ColossalChat/coati/trainer/utils.py

"""
Training utilities for Coati.
"""

from typing import Any

import torch
import torch.distributed as dist
from torch.utils._pytree import tree_map
from torch.utils.data import DataLoader


class CycledDataLoader:
    """
    Endless wrapper around a data loader: when the wrapped loader runs out of
    batches, iteration restarts from the beginning.

    Args:
        dataloader (DataLoader): The data loader to wrap.

    Attributes:
        dataloader (DataLoader): The wrapped data loader.
        count (int): Incremented on every call to ``next()`` and reset to 0
            whenever the wrapped loader is exhausted and restarted.
        dataloader_iter (iterator): The active iterator over ``dataloader``;
            ``None`` until the first call to ``next()``.

    Methods:
        next(): Fetch the next batch, restarting the wrapped loader when it is exhausted.
    """

    def __init__(self, dataloader: DataLoader) -> None:
        self.dataloader = dataloader
        self.count = 0
        # The iterator is built on the first call to next(), not here.
        self.dataloader_iter = None

    def next(self):
        """
        Fetch the next batch, restarting the wrapped loader when it is exhausted.

        Returns:
            Any: The next batch produced by the wrapped data loader.

        Raises:
            StopIteration: If the wrapped loader yields nothing even right
                after being restarted (i.e. it is empty).
        """
        if self.dataloader_iter is None:
            # First call: create the iterator lazily.
            self.dataloader_iter = iter(self.dataloader)

        self.count += 1
        try:
            batch = next(self.dataloader_iter)
        except StopIteration:
            # The current pass is finished: reset the counter and begin a new pass.
            self.count = 0
            self.dataloader_iter = iter(self.dataloader)
            batch = next(self.dataloader_iter)
        return batch


def is_rank_0() -> bool:
    """
    Tell whether this process should be treated as the rank 0 process.

    Returns:
        bool: True when torch.distributed has not been initialized, or when it
            has been and this process's rank is 0; False otherwise.
    """
    # Without an initialized process group there is only this process.
    if not dist.is_initialized():
        return True
    return dist.get_rank() == 0


def to_device(x: Any, device: torch.device) -> Any:
    """
    Move every tensor found in ``x`` to ``device``.

    ``x`` may be a single tensor or a nested structure; it is traversed with
    ``tree_map``. Leaves that are not tensors are passed through unchanged.

    Args:
        x (Any): A tensor or nested structure containing tensors.
        device (torch.device): The device the tensors are moved to.

    Returns:
        Any: A structure mirroring ``x`` whose tensor leaves are on ``device``.
    """
    return tree_map(
        lambda leaf: leaf.to(device) if isinstance(leaf, torch.Tensor) else leaf,
        x,
    )


def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """
    Perform all-reduce operation on the given tensor and compute the mean across all processes.

    The reduction is done in place: ``tensor`` is summed across processes and
    then divided by the world size. When torch.distributed has not been
    initialized the run is treated as single-process and ``tensor`` is returned
    untouched, since the mean over one process is the tensor itself.

    Args:
        tensor (torch.Tensor): The input tensor to be reduced (modified in place).

    Returns:
        torch.Tensor: The same tensor object, holding the mean across all processes.
    """
    # Single-process fallback: no process group means there is nothing to reduce over.
    if not dist.is_initialized():
        return tensor
    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
    tensor.div_(dist.get_world_size())
    return tensor


def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:
    """
    Performs an all-reduce operation to sum the values of the given tensor across all processes.

    The reduction is done in place. When torch.distributed has not been
    initialized the run is treated as single-process and ``tensor`` is returned
    untouched, since the sum over one process is the tensor itself.

    Args:
        tensor (torch.Tensor): The input tensor to be reduced (modified in place).

    Returns:
        torch.Tensor: The same tensor object, holding the sum of values across all processes.
    """
    # Single-process fallback: no process group means there is nothing to reduce over.
    if not dist.is_initialized():
        return tensor
    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
    return tensor
[ColossalChat] Update RLHF V2 (#5286) * Add dpo. Fix sft, ppo, lora. Refactor all * fix and tested ppo * 2 nd round refactor * add ci tests * fix ci * fix ci * fix readme, style * fix readme style * fix style, fix benchmark * reproduce benchmark result, remove useless files * rename to ColossalChat * use new image * fix ci workflow * fix ci * use local model/tokenizer for ci tests * fix ci * fix ci * fix ci * fix ci timeout * fix rm progress bar. fix ci timeout * fix ci * fix ci typo * remove 3d plugin from ci temporary * test environment * cannot save optimizer * support chat template * fix readme * fix path * test ci locally * restore build_or_pr * fix ci data path * fix benchmark * fix ci, move ci tests to 3080, disable fast tokenizer * move ci to 85 * support flash attention 2 * add all-in-one data preparation script. Fix colossal-llama2-chat chat template * add hardware requirements * move ci test data * fix save_model, add unwrap * fix missing bos * fix missing bos; support grad accumulation with gemini * fix ci * fix ci * fix ci * fix llama2 chat template config * debug sft * debug sft * fix colossalai version requirement * fix ci * add sanity check to prevent NaN loss * fix requirements * add dummy data generation script * add dummy data generation script * add dummy data generation script * add dummy data generation script * update readme * update readme * update readme and ignore * fix logger bug * support parallel_output * modify data preparation logic * fix tokenization * update lr * fix inference * run pre-commit --------- Co-authored-by: Tong Li <tong.li352711588@gmail.com> 2024-03-29 06:12:29 +00:00			`"""`
			`Training utilities for Coati.`
			`"""`
[pre-commit.ci] pre-commit autoupdate (#5572) * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/PyCQA/autoflake: v2.2.1 → v2.3.1](https://github.com/PyCQA/autoflake/compare/v2.2.1...v2.3.1) - [github.com/pycqa/isort: 5.12.0 → 5.13.2](https://github.com/pycqa/isort/compare/5.12.0...5.13.2) - [github.com/psf/black-pre-commit-mirror: 23.9.1 → 24.4.2](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...24.4.2) - [github.com/pre-commit/mirrors-clang-format: v13.0.1 → v18.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v13.0.1...v18.1.7) - [github.com/pre-commit/pre-commit-hooks: v4.3.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.3.0...v4.6.0) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 2024-07-01 09:16:41 +00:00
[ColossalChat] Update RLHF V2 (#5286) * Add dpo. Fix sft, ppo, lora. Refactor all * fix and tested ppo * 2 nd round refactor * add ci tests * fix ci * fix ci * fix readme, style * fix readme style * fix style, fix benchmark * reproduce benchmark result, remove useless files * rename to ColossalChat * use new image * fix ci workflow * fix ci * use local model/tokenizer for ci tests * fix ci * fix ci * fix ci * fix ci timeout * fix rm progress bar. fix ci timeout * fix ci * fix ci typo * remove 3d plugin from ci temporary * test environment * cannot save optimizer * support chat template * fix readme * fix path * test ci locally * restore build_or_pr * fix ci data path * fix benchmark * fix ci, move ci tests to 3080, disable fast tokenizer * move ci to 85 * support flash attention 2 * add all-in-one data preparation script. Fix colossal-llama2-chat chat template * add hardware requirements * move ci test data * fix save_model, add unwrap * fix missing bos * fix missing bos; support grad accumulation with gemini * fix ci * fix ci * fix ci * fix llama2 chat template config * debug sft * debug sft * fix colossalai version requirement * fix ci * add sanity check to prevent NaN loss * fix requirements * add dummy data generation script * add dummy data generation script * add dummy data generation script * add dummy data generation script * update readme * update readme * update readme and ignore * fix logger bug * support parallel_output * modify data preparation logic * fix tokenization * update lr * fix inference * run pre-commit --------- Co-authored-by: Tong Li <tong.li352711588@gmail.com> 2024-03-29 06:12:29 +00:00			`from typing import Any`

			`import torch`
			`import torch.distributed as dist`
			`from torch.utils._pytree import tree_map`
			`from torch.utils.data import DataLoader`


			`class CycledDataLoader:`
			`"""`
			`A data loader that cycles through the data when it reaches the end.`

			`Args:`
			`dataloader (DataLoader): The original data loader.`

			`Attributes:`
			`dataloader (DataLoader): The original data loader.`
			`count (int): The number of times the data loader has been cycled.`
			`dataloader_iter (iterable): The iterator for the data loader.`

			`Methods:`
			`next(): Returns the next batch of data from the data loader, cycling through the data if necessary.`
			`"""`

			`def __init__(`
			`self,`
			`dataloader: DataLoader,`
			`) -> None:`
			`self.dataloader = dataloader`

			`self.count = 0`
			`self.dataloader_iter = None`

			`def next(self):`
			`"""`
			`Returns the next batch of data from the data loader, cycling through the data if necessary.`

			`Returns:`
			`Any: The next batch of data from the data loader.`
			`"""`
			`# defer initialization`
			`if self.dataloader_iter is None:`
			`self.dataloader_iter = iter(self.dataloader)`

			`self.count += 1`
			`try:`
			`return next(self.dataloader_iter)`
			`except StopIteration:`
			`self.count = 0`
			`self.dataloader_iter = iter(self.dataloader)`
			`return next(self.dataloader_iter)`


			`def is_rank_0() -> bool:`
			`"""`
			`Check if the current process is the rank 0 process in a distributed training setup.`

			`Returns:`
			`bool: True if the current process is the rank 0 process, False otherwise.`
			`"""`
			`return not dist.is_initialized() or dist.get_rank() == 0`


			`def to_device(x: Any, device: torch.device) -> Any:`
			`"""`
			`Move the input tensor or nested structure of tensors to the specified device.`

			`Args:`
			`x (Any): The input tensor or nested structure of tensors.`
			`device (torch.device): The target device to move the tensors to.`

			`Returns:`
			`Any: The tensor or nested structure of tensors moved to the target device.`
			`"""`

			`def _to(t: Any):`
			`if isinstance(t, torch.Tensor):`
			`return t.to(device)`
			`return t`

			`return tree_map(_to, x)`


			`def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:`
			`"""`
			`Perform all-reduce operation on the given tensor and compute the mean across all processes.`

			`Args:`
			`tensor (torch.Tensor): The input tensor to be reduced.`

			`Returns:`
			`torch.Tensor: The reduced tensor with mean computed across all processes.`
			`"""`
			`dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)`
			`tensor.div_(dist.get_world_size())`
			`return tensor`


			`def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:`
			`"""`
			`Performs an all-reduce operation to sum the values of the given tensor across all processes.`

			`Args:`
			`tensor (torch.Tensor): The input tensor to be reduced.`

			`Returns:`
			`torch.Tensor: The reduced tensor with the sum of values across all processes.`
			`"""`
			`dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)`
			`return tensor`