ColossalAI/colossalai/zero/gemini/chunk/search_utils.py


import math
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch.distributed as dist
import torch.nn as nn

from colossalai.tensor import ColoParameter
from colossalai.utils import is_ddp_ignored
from colossalai.zero.gemini.memory_tracer import MemStats, OrderedParamGenerator


def _filter_exlarge_params(model: nn.Module, size_dict: Dict[int, List[int]]) -> None:
    """_filter_exlarge_params

    Filter out parameters whose size is extremely large (more than three standard
    deviations above the mean size of all parameters).

    Args:
        model (nn.Module): the model.
        size_dict (Dict[int, List[int]]): the size dict of parameters, keyed by dp degree.
    """
    agg_size_list = []
    for key in size_dict:
        agg_size_list.extend(size_dict[key])

    if len(agg_size_list) == 0:
        return

    params_size_arr = np.array(agg_size_list)

    std = np.std(params_size_arr)
    mean = np.mean(params_size_arr)
    upper_limit = mean + 3 * std

    for key in size_dict:
        org_list = size_dict[key]
        # keep only the sizes within mean + 3 * std
        size_dict[key] = list(filter(lambda x: x <= upper_limit, org_list))
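

# Hedged worked example (not in the original file): suppose size_dict holds twenty
# parameters of 1024 elements and one outlier of 4194304 (4M) elements. The mean is
# roughly 2.0e5 and the standard deviation roughly 8.9e5, so upper_limit is roughly
# 2.9e6 and the 4M outlier is dropped. Note this only removes the outlier from the
# chunk-size search below; the parameter itself is still packed into chunks later.
#
#     size_dict = {1: [1024] * 20 + [4 * 1024 * 1024]}
#     _filter_exlarge_params(model=None, size_dict=size_dict)   # model is unused here
#     # size_dict[1] -> [1024] * 20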


def _get_unused_byte(size_list: List[int], chunk_size: int) -> int:
    """_get_unused_byte

    Get the unused (wasted) space for a certain chunk size, assuming the parameters
    are packed greedily in order.

    Args:
        size_list (List[int]): the size list of parameters.
        chunk_size (int): the chunk size.

    Returns:
        int: the unused space for this chunk size.
    """
    acc = 0
    left = 0
    for s in size_list:
        if s > left:
            # the parameter does not fit into the current chunk:
            # count the leftover space as waste and open a new chunk
            acc += left
            left = chunk_size
        left -= s
    return left + acc
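

# Hedged worked example (not in the original file): packing is greedy, so a parameter
# that does not fit in the remaining space of the current chunk wastes that space and
# opens a new chunk. E.g. with chunk_size=8:
#
#     _get_unused_byte([5, 4], 8)
#     # param 5 -> new chunk, 3 left; param 4 does not fit -> waste 3, new chunk, 4 left
#     # returns 3 + 4 = 7 (two chunks provide 16 slots for 9 elements)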


def _tensor_numel(local_param: ColoParameter, strict_ddp_flag: bool) -> int:
    """_tensor_numel

    Get the number of elements of a tensor.

    Args:
        local_param (ColoParameter): The local parameter.
        strict_ddp_flag (bool): whether to enable the strict ddp mode.

    Returns:
        int: the number of elements.
    """
    if strict_ddp_flag and type(local_param) is ColoParameter:
        return local_param.numel_global()
    else:
        # if local_param is not ColoParameter, we assume it's replicated
        return local_param.numel()
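

# Hedged note (not in the original file): in strict ddp mode a ColoParameter is counted
# by its global (unsharded) element count. For example, a weight evenly sharded across
# 4 tensor-parallel ranks with a 1024-element local shard would contribute 4096 elements
# here (hypothetical numbers), while a plain nn.Parameter always contributes its local
# numel().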


def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
                                 strict_ddp_flag: bool = False) -> Dict[int, List[ColoParameter]]:
    """classify_params_by_dp_degree

    Classify the parameters by their dp degree.

    Args:
        param_order (OrderedParamGenerator): the order in which the parameters are visited.
        strict_ddp_flag (bool, optional): whether to enable the strict ddp mode. Defaults to False.

    Returns:
        Dict[int, List[ColoParameter]]: a dict that contains the classification results.
            The keys are dp_degrees and the values are lists of parameters.
    """
    params_dict: Dict[int, List[ColoParameter]] = dict()
    for param in param_order.generate():
        # assert isinstance(param, ColoParameter), "please init model in the ColoInitContext"
        if is_ddp_ignored(param):
            continue

        if strict_ddp_flag or type(param) is not ColoParameter:
            # if the model is not initialized with ColoInitContext, we assume the parameter is replicated
            # TODO(ver217): integrate DTensor
            param_key = dist.get_world_size()
        else:
            param_key = param.process_group.dp_world_size()

        if param_key not in params_dict:
            params_dict[param_key] = []
        params_dict[param_key].append(param)

    return params_dict
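

# Hedged illustration (not in the original file): the keys of the returned dict are
# data-parallel world sizes. For instance, on 8 GPUs with strict_ddp_flag=False, a model
# mixing plain nn.Parameters (treated as replicated across all 8 ranks) and ColoParameters
# whose process group has dp_world_size() == 2 would yield a dict shaped like
# {8: [plain params...], 2: [colo params...]} (hypothetical setup).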


def search_chunk_configuration(
        model: nn.Module,
        search_range_m: float,
        search_interval: int,    # hidden size is the best value for the interval
        min_chunk_size_m: float = 32,
        filter_exlarge_params: bool = True,
        strict_ddp_flag: bool = False,
        memstas: Optional[MemStats] = None) -> Tuple[Dict, int, int]:
    """search_chunk_configuration

    Search the chunk configuration for a model.

    Args:
        model (nn.Module): torch module
        search_range_m (float): searching range divided by 2^20.
        search_interval (int): searching interval.
        min_chunk_size_m (float, optional): the minimum size of a distributed chunk, divided by 2^20.
        filter_exlarge_params (bool, optional): filter extremely large parameters. Defaults to True.
        strict_ddp_flag (bool, optional): whether to enable the strict ddp mode.
            All parameters keep replicated in this mode.
        memstas (MemStats, optional): memory statistics collected by a tracer. Defaults to None.

    Returns:
        Tuple[Dict, int, int]: the chunk config (a dict of dp_degree -> chunk init args),
            the total parameter size and the minimum chunk waste.
    """
    if memstas is not None:
        param_order = memstas.param_order()
    else:
        # build the param visited order right now
        param_order = OrderedParamGenerator()
        for p in model.parameters():
            param_order.append(p)

    search_range = round(search_range_m * 1024**2)
    min_chunk_size = round(min_chunk_size_m * 1024**2)
    assert search_range >= 0

    params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
    size_lcm = np.lcm.reduce(list(params_dict.keys()))
    config_dict: Dict[int, Dict] = dict()
    total_param_size = 0

    size_dict: Dict[int, List[int]] = dict()
    for dp_degree in params_dict:
        params_list = params_dict[dp_degree]
        size_list = [_tensor_numel(p, strict_ddp_flag) for p in params_list]
        group_acc_size = sum(size_list)
        total_param_size += group_acc_size

        # let small parameters keep gathered in CUDA all the time
        if group_acc_size < min_chunk_size:
            config_dict[dp_degree] = dict(chunk_size=group_acc_size, keep_gathered=True)
        else:
            size_dict[dp_degree] = size_list

    if filter_exlarge_params:
        _filter_exlarge_params(model, size_dict)

    max_size = min_chunk_size
    for key in size_dict:
        max_size = max(max_size, max(size_dict[key]))
    start_size = int(math.ceil(max_size / search_interval) * search_interval)

    min_chunk_waste = float('+inf')
    best_chunk_size = start_size

    for chunk_size in range(start_size, start_size + search_range + 1, search_interval):
        temp_waste = 0
        for key in size_dict:
            temp_waste += _get_unused_byte(size_dict[key], chunk_size)
        if temp_waste < min_chunk_waste:
            min_chunk_waste = temp_waste
            best_chunk_size = chunk_size

    # the chunk size needs to be divisible by the dp degree of every group,
    # so that each chunk can be sharded evenly
    best_chunk_size = best_chunk_size + (-best_chunk_size % size_lcm)

    for dp_degree in params_dict:
        if dp_degree in config_dict:
            continue
        config_dict[dp_degree] = dict(chunk_size=best_chunk_size, keep_gathered=False)

    return config_dict, total_param_size, min_chunk_waste
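

if __name__ == "__main__":
    # Hedged usage sketch (not part of the original module): shows how the search could be
    # driven on a toy, non-distributed model. The model and all sizes below are hypothetical.
    # classify_params_by_dp_degree() falls back to dist.get_world_size() for plain
    # nn.Parameters, so a single-process process group is initialized first.
    if not dist.is_initialized():
        dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29512", rank=0, world_size=1)

    toy_model = nn.Sequential(nn.Linear(1024, 1024), nn.Linear(1024, 1024))
    config, total_numel, waste = search_chunk_configuration(
        toy_model,
        search_range_m=1.0,       # search 1 * 2**20 elements above the start size
        search_interval=1024,     # the model's hidden size is usually a good interval
        min_chunk_size_m=0.25,    # small threshold so the toy parameters form a real chunk
    )
    print(f"chunk config: {config}, total param elements: {total_numel}, waste: {waste}")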