""" adapted from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/adamw8bit.py"""
import warnings
from typing import List

import torch
from bitsandbytes.optim.optimizer import Optimizer2State
from torch._C import _LinAlgError


def get_galore_param_groups(
    model, weight_decay, rank=256, update_proj_gap=200, scale=0.25, proj_type="std"
) -> List[dict]:
    """
    Build parameter groups for GaLore: 2D weights get low-rank projection, while
    biases and LayerNorm weights are excluded from weight decay.

    It's advised to use this instead of manually specifying which param groups
    to apply GaLore on.
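
    A minimal usage sketch (``model`` is any ``torch.nn.Module``; the values shown
    are illustrative, not tuned defaults):

        param_groups = get_galore_param_groups(model, weight_decay=1e-2, rank=128)
        optimizer = GaLoreAdamW8bit(param_groups, lr=1e-2)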
    """
    galore_params = []
    non_galore = []
    no_decay_params = []
    no_decay = ["bias", "LayerNorm.weight"]

    for name, param in model.named_parameters():
        # It only makes sense to do SVD on 2D gradient matrices,
        # e.g. nn.Linear, VocabEmbedding, etc.
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        elif param.dim() == 2:
            galore_params.append(param)
        else:
            non_galore.append(param)

    param_groups = [
        {
            "params": galore_params,
            "rank": rank,
            "update_proj_gap": update_proj_gap,
            "scale": scale,
            "proj_type": proj_type,
            "weight_decay": weight_decay,
        },
        {"params": non_galore, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    return param_groups


def make_low_rank_buffer(p, grad):
    """For compatibility with bitsandbytes's update_step, we need an empty low-rank
    param update buffer to avoid mutating original params.
    TODO: optimize by reusing the memory for p.grad? Need to modify bitsandbytes?
    """
    p.saved_data = p.data.clone()
    # p.data = grad.clone().to(p.data.dtype).to(p.data.device)
    p.data = torch.zeros_like(grad, device=grad.device, dtype=grad.dtype)
    # p.data.zero_()
    p.grad = grad
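

# Intended call sequence (see GaLoreAdamW8bit.step below): the projected gradient is
# staged into a zeroed copy of p.data so that bitsandbytes' update_step writes the
# low-rank Adam update into p.data instead of the real weights; step() then projects
# that update back to full rank and adds it onto the saved full-rank weights.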


class GaLoreProjector:
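    """Per-parameter GaLore projector (the shape sketch below is illustrative only).

    For a 2D gradient ``G`` of shape ``(m, n)`` with ``m >= n`` and ``proj_type="std"``,
    the projector keeps ``Vh[:rank]`` from ``svd(G)`` and refreshes it every
    ``update_proj_gap`` steps:

        proj = GaLoreProjector(rank=4, update_proj_gap=10, scale=1.0)
        g = torch.randn(8, 6)
        low = proj.project(g, iter=0)     # shape (8, 4): g @ Vh[:4].T
        full = proj.project_back(low)     # shape (8, 6): (low @ Vh[:4]) * scale
    """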

    def __init__(self, rank, verbose=False, update_proj_gap=200, scale=1.0, proj_type="std"):
        self.rank = rank
        self.verbose = verbose
        self.update_proj_gap = update_proj_gap
        self.scale = scale
        self.ortho_matrix = None
        self.proj_type = proj_type
        self.svd_type = None

    def project(self, full_rank_grad, iter):
        dim = full_rank_grad.dim()
        if dim != 2:
            warnings.warn(
                f"You shouldn't specify a projection rank for {dim}D params in param_groups. Skipping SVD."
            )
            return full_rank_grad

        m, n = full_rank_grad.shape  # For ZeRO sharded grads
        if self.proj_type == "std":
            # Project onto the smaller dimension to minimize information loss
            if self.svd_type is None:
                self.svd_type = "right" if m >= n else "left"
            # SVD step
            if self.ortho_matrix is None or iter % self.update_proj_gap == 0:
                self.ortho_matrix = self.get_orthogonal_matrix(full_rank_grad, self.rank, type=self.svd_type)
            if self.svd_type == "right":
                low_rank_grad = torch.matmul(full_rank_grad, self.ortho_matrix.t()[:n])
            else:
                low_rank_grad = torch.matmul(self.ortho_matrix.t()[:, :m], full_rank_grad)

        elif self.proj_type == "reverse_std":
            if self.svd_type is None:
                self.svd_type = "left" if m >= n else "right"
            # SVD step
            if self.ortho_matrix is None or iter % self.update_proj_gap == 0:
                self.ortho_matrix = self.get_orthogonal_matrix(full_rank_grad, self.rank, type=self.svd_type)

            if self.svd_type == "left":
                low_rank_grad = torch.matmul(self.ortho_matrix.t()[:, :m], full_rank_grad)
            else:
                low_rank_grad = torch.matmul(full_rank_grad, self.ortho_matrix.t()[:n])
        else:
            raise ValueError("proj_type should be std or reverse_std")
        return low_rank_grad
    def project_back(self, low_rank_grad):
        if low_rank_grad.dim() != 2:
            return

        m, n = low_rank_grad.shape
        if self.svd_type == "right":
            full_rank_grad = torch.matmul(low_rank_grad, self.ortho_matrix[:n])
        else:
            full_rank_grad = torch.matmul(self.ortho_matrix[:, :m], low_rank_grad)

        return full_rank_grad * self.scale

    # svd decomposition
    def get_orthogonal_matrix(self, weights, rank, type):
        module_params = weights

        if module_params.data.dtype != torch.float:
            float_data = False
            original_type = module_params.data.dtype
            original_device = module_params.data.device
            matrix = module_params.data.float()
        else:
            float_data = True
            matrix = module_params.data

        # TODO: redo SVD in the next step.
        if matrix.isnan().any():
            print(f"{__file__}: skipping SVD due to NaN matrix")
            return self.ortho_matrix
        try:
            U, s, Vh = torch.linalg.svd(matrix, full_matrices=False)
        except _LinAlgError as e:
            print(f"{__file__}: skipping SVD due to {e}")
            return self.ortho_matrix

        # make the smaller matrix always be the orthogonal matrix
        if type == "right":
            B = Vh[:rank, :]

            if not float_data:
                B = B.to(original_device).type(original_type)
            return B
        elif type == "left":
            A = U[:, :rank]
            if not float_data:
                A = A.to(original_device).type(original_type)
            return A
        elif type == "full":
            A = U[:, :rank]
            B = Vh[:rank, :]
            if not float_data:
                A = A.to(original_device).type(original_type)
                B = B.to(original_device).type(original_type)
            return [A, B]
        else:
            raise ValueError("type should be left, right or full")


class GaLoreAdamW8bit(Optimizer2State):
    r"""Implements GaLore, an optimizer-agnostic gradient compression technique, on top of 8-bit AdamW.
    Proposed in `GaLore: Memory-Efficient LLM Training by Gradient Low-Rank Projection`. It compresses
    gradients via low-rank projection and is claimed to be insensitive to hyperparameters such as the learning rate.
    https://arxiv.org/abs/2403.03507

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-2)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.01)
        nbits (int): The number of bits of optim states. Only 32 and 8 are supported.
        min_8bit_size (`int`, defaults to 4096):
            The minimum number of elements of the parameter tensors for 8-bit optimization.
        percentile_clipping (`int`, defaults to 100):
            Adapts the clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
        block_wise (`bool`, defaults to `True`):
            Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
        is_paged (`bool`, defaults to `False`):
            Whether the optimizer is a paged optimizer (handles memory spikes via CPU-GPU transfer) or not.
        args (dict, optional): quantization-related arguments. If passed, will override all quantization args above.
    Example:
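        A minimal sketch (``model`` is any ``torch.nn.Module``; the data loop and
        loss computation below are illustrative placeholders):

            param_groups = get_galore_param_groups(model, weight_decay=1e-2, rank=128)
            optimizer = GaLoreAdamW8bit(param_groups, lr=1e-2)
            for batch in dataloader:
                loss = compute_loss(model, batch)  # placeholder forward/loss
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()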
    """

    def __init__(
        self,
        params,
        lr=1e-2,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=1e-2,
        nbits=8,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
        is_paged=False,
        args=None,
    ):
        super().__init__(
            "adam",
            params,
            lr,
            betas,
            eps,
            weight_decay,
            optim_bits=nbits,
            args=args,
            min_8bit_size=min_8bit_size,
            percentile_clipping=percentile_clipping,
            block_wise=block_wise,
            is_paged=is_paged,
        )

        proj_none = all("rank" not in group for group in self.param_groups)
        if proj_none:
            warnings.warn(
                "Will not apply GaLore as no rank is specified in any param group. Did you forget to? Try get_galore_param_groups."
            )

        # Defaults from the paper
        for group in self.param_groups:
            if "rank" in group:
                group["update_proj_gap"] = group.get("update_proj_gap", 200)
                group["proj_type"] = group.get("proj_type", "std")
                group["scale"] = group.get("scale", 0.25)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        if not self.initialized:
            self.check_overrides()
            self.to_gpu()  # needed for fairseq pure fp16 training
            self.initialized = True

        for gindex, group in enumerate(self.param_groups):
            for pindex, p in enumerate(group["params"]):
                if p.grad is None:
                    continue
                state = self.state[p]

                if "step" not in state:
                    state["step"] = 0

                # GaLore Projection
                if "rank" in group:
                    if "projector" not in state:
                        state["projector"] = GaLoreProjector(
                            group["rank"],
                            scale=group["scale"],
                            update_proj_gap=group["update_proj_gap"],
                            proj_type=group["proj_type"],
                        )

                    if "weight_decay" in group and group["weight_decay"] > 0:
                        # do not let the 8-bit update apply weight decay to the low-rank
                        # buffer; it is applied manually after projecting back
                        group["weight_decay_saved"] = group["weight_decay"]
                        group["weight_decay"] = 0

                    grad = state["projector"].project(p.grad, state["step"])
                    make_low_rank_buffer(p, grad)

                if "state1" not in state:
                    self.init_state(group, p, gindex, pindex)

                # p.grad = p.grad.contiguous() # avoid bitsandbytes update error
                # Prefetch if paged
                self.prefetch_state(p)
                # Adam update step using the buffer
                self.update_step(group, p, gindex, pindex)
                torch.cuda.synchronize()

                # GaLore Projection Back
                if "rank" in group:
                    update = state["projector"].project_back(p.data)
                    p.data = p.saved_data.add_(update)

                # apply weight decay
                if "weight_decay_saved" in group:
                    p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay_saved"])
                    group["weight_decay"] = group["weight_decay_saved"]
                    del group["weight_decay_saved"]

        if self.is_paged:
            # all paged operations are asynchronous, we need
            # to sync to make sure all tensors are in the right state
            torch.cuda.synchronize()

        return loss

    def __del__(self):
        """Avoid buffer memory leak"""
        for group in self.param_groups:
            for p in group["params"]:
                if hasattr(p, "saved_data"):
                    del p.saved_data