From ef851d16c689169f4d1575cb7265da30ad387fc6 Mon Sep 17 00:00:00 2001 From: Sun Peng Date: Tue, 15 Aug 2023 18:55:10 +0800 Subject: [PATCH 1/8] Feat/optimizer (#194) * feat(optimier.py): reduce memory footprint and avoid _check_overflow call * feat(optimier.py): reduce memory footprint and avoid _check_overflow call * feat(optimizer.py): overlap compute norm with allreduce * update var and function name * update function compute norm (#197) Co-authored-by: ChenQiaoling00 * feat(optimizer/hybrid_zero_optim.py): overlap gradients last bucket allreduce and compute norm (#196) * support gradients allreduce and compute norm overlap * fix para set error * remove timer cal_norm for testing * feat(optimizer/hybrid_zero_optim.py): support group global norm * format(lint): fix lint error * feat(optimizer/store.py): update code based on comment --------- Co-authored-by: ChenQiaoling00 Co-authored-by: huangting4201 <1538303371@qq.com> --- .gitignore | 1 + internlm/model/norm.py | 18 +- internlm/model/utils.py | 12 +- .../solver/optimizer/hybrid_zero_optim.py | 166 ++++++++++-------- internlm/solver/optimizer/store.py | 34 ++++ internlm/solver/optimizer/utils.py | 19 +- train.py | 10 +- 7 files changed, 168 insertions(+), 92 deletions(-) diff --git a/.gitignore b/.gitignore index 44b0e77..ca9ea1a 100644 --- a/.gitignore +++ b/.gitignore @@ -115,6 +115,7 @@ venv.bak/ *.pkl *.pkl.json *.log.json +*.trace.json docs/modelzoo_statistics.md mmdet/.mim work_dirs/ diff --git a/internlm/model/norm.py b/internlm/model/norm.py index edbbbfe..6598e17 100644 --- a/internlm/model/norm.py +++ b/internlm/model/norm.py @@ -7,23 +7,25 @@ from torch.nn import init from torch.nn.parameter import Parameter -def manual_rms_norm(input, normalized_shape, weight, eps): +def manual_rms_norm(my_input, normalized_shape, weight, eps): # layer norm should always be calculated in float32 dims = tuple(i for i in range(-1, -len(normalized_shape) - 1, -1)) - variance = input.to(torch.float32).pow(2).mean(dims, keepdim=True) - input = input * torch.rsqrt(variance + eps) + variance = my_input.to(torch.float32).pow(2).mean(dims, keepdim=True) + my_input = my_input * torch.rsqrt(variance + eps) if weight is None: - return input + return my_input # convert into half-precision if necessary if weight.dtype in [torch.float16, torch.bfloat16]: - input = input.to(weight.dtype) + my_input = my_input.to(weight.dtype) - return weight * input + return weight * my_input class RMSNormTorch(torch.nn.Module): + """A custom PyTorch module for RMS normalization.""" + def __init__(self, normalized_shape, eps=1e-5): super().__init__() @@ -34,8 +36,8 @@ class RMSNormTorch(torch.nn.Module): self.weight = Parameter(torch.empty(*normalized_shape)) self.reset_parameters() - def forward(self, input: torch.Tensor): - return manual_rms_norm(input, self.normalized_shape, self.weight, self.eps) + def forward(self, _input: torch.Tensor): + return manual_rms_norm(_input, self.normalized_shape, self.weight, self.eps) def reset_parameters(self): init.ones_(self.weight) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 8b80af2..18845f0 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -88,15 +88,17 @@ def gather_forward_split_backward(input_, parallel_mode, dim): return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim) -def linear_bias_wgrad_torch(input, grad_output, has_d_bias): - assert input.dtype == grad_output.dtype - grad_weight = torch.matmul(grad_output.t(), input) +def linear_bias_wgrad_torch(my_input, grad_output, has_d_bias): + assert my_input.dtype == grad_output.dtype + grad_weight = torch.matmul(grad_output.t(), my_input) grad_bias = grad_output.sum(dim=0) if has_d_bias else None return grad_weight, grad_bias # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py class FusedDenseFuncTorch(FusedDenseFunc): + """A custom PyTorch module extending FusedDenseFunc.""" + @staticmethod @custom_bwd def backward(ctx, grad_output, *args): @@ -173,8 +175,8 @@ class _SplitForwardGatherBackward(torch.autograd.Function): """ @staticmethod - def symbolic(graph, input_): - return _split(input_) + def symbolic(input_): + return _split(input_, parallel_mode=None) @staticmethod def forward(ctx, input_, parallel_mode, dim): diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 9d42a98..655d06c 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -178,6 +178,7 @@ class HybridZeroOptimizer(BaseOptimizer): if len(params) != 0: self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params) for param in params: + setattr(param, "group_id", group_id) self._param_store.set_param_to_rank(param, rank) # move to cpu to make room to create the flat tensor @@ -317,7 +318,7 @@ class HybridZeroOptimizer(BaseOptimizer): # if full, will reduce the grads already in the bucket # after reduction, the bucket will be empty if self._bucket_store.num_elements_in_bucket(reduce_rank) + param_size > self._reduce_bucket_size: - self._reduce_grads_stored_in_bucket(reduce_rank) + self._reduce_grads_stored_in_bucket(reduce_rank, last_bucket=False) # the param must not be reduced to ensure correctness is_param_reduced = self._param_store.is_param_reduced(param) @@ -335,7 +336,7 @@ class HybridZeroOptimizer(BaseOptimizer): self._bucket_store.add_grad(param.grad, reduce_rank) self._bucket_store.add_param(param, reduce_rank) - def _reduce_grads_stored_in_bucket(self, reduce_rank=None): + def _reduce_grads_stored_in_bucket(self, reduce_rank=None, last_bucket=False): # reduce grads self._reduce_grads_by_rank( reduce_rank=reduce_rank, @@ -343,30 +344,27 @@ class HybridZeroOptimizer(BaseOptimizer): bucket_size=self._bucket_store.num_elements_in_bucket(reduce_rank), ) - # use communication stream if overlapping - # communication with computation - if self._overlap_communication: - stream = self._comm_stream - else: - stream = torch.cuda.current_stream() + params_in_bucket = self._bucket_store.get_param(reduce_rank=reduce_rank) - with torch.cuda.stream(stream): - params_in_bucket = self._bucket_store.get_param(reduce_rank=reduce_rank) + for param in params_in_bucket: + # the is_param_reduced flag should be False showing that + # this param is not reduced before calling self._reduce_grads_by_rank + is_param_reduced = self._param_store.is_param_reduced(param) - for param in params_in_bucket: - # the is_param_reduced flag should be False showing that - # this param is not reduced before calling self._reduce_grads_by_rank - is_param_reduced = self._param_store.is_param_reduced(param) + if is_param_reduced: + msg = ( + f"Parameter of size ({param.size()}) has been reduced, " + + "duplicate reduction will lead to arithmetic incorrectness" + ) + raise RuntimeError(msg) - if is_param_reduced: - msg = ( - f"Parameter of size ({param.size()}) has been reduced, " - + "duplicate reduction will lead to arithmetic incorrectness" - ) - raise RuntimeError(msg) + # update the flag + self._param_store.set_param_reduction_state(param, True) - # update the flag - self._param_store.set_param_reduction_state(param, True) + if self._param_store.belongs_to_current_rank(param): + self._param_store.add_reduced_param_for_compute_norm(param, last_bucket) + else: + self._param_store.add_previous_reduced_param(param) self._bucket_store.reset_by_rank(reduce_rank) @@ -385,9 +383,9 @@ class HybridZeroOptimizer(BaseOptimizer): def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank): if self._overlap_communication: - torch.cuda.synchronize() - self._param_store.clear_grads_of_previous_reduced_params() stream = self._comm_stream + stream.synchronize() + self._param_store.clear_grads_of_previous_reduced_params() else: stream = torch.cuda.current_stream() @@ -421,6 +419,7 @@ class HybridZeroOptimizer(BaseOptimizer): reduction_states = self._param_store.get_param_reduction_states() for tensor, _ in reduction_states.items(): reduction_states[tensor] = False + self._param_store.reset_reduced_data_for_compute_norm() # accumulate gradient avg_gradients = self._grad_store._averaged_gradients @@ -469,6 +468,30 @@ class HybridZeroOptimizer(BaseOptimizer): # Gradients may not be fully synchronized here. + def _compute_norm_with_stage( + self, + group_id: int = 0, + last_bucket: bool = False, + last_stage: bool = False, + previous_norm=None, + ): + # compute norm for gradients that have been reduced + params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket) + if len(params) == 0: + grads = [self.padding_grad] + params = [self.padding_tensor] + + if self._clip_grad_norm > 0: + # this norm is before scaling, it will be very large + norm = compute_norm( + gradients=grads, + parameters=params, + last_stage=last_stage, + previous_norm=previous_norm, + ) + + return norm + def step(self, closure=None): """Performs a single optimization step. @@ -480,7 +503,6 @@ class HybridZeroOptimizer(BaseOptimizer): """ assert closure is None, "closure is not supported by step()" - timer("sync_grad").start() # if not overlapping communication (no reduction hook is attached) # we need to manually reduce these gradients if not self._overlap_communication: @@ -490,54 +512,49 @@ class HybridZeroOptimizer(BaseOptimizer): self._store_and_try_reduce_grads_by_bucket(param) # we need to reduce the gradients left in the communication bucket - self._reduce_grads_stored_in_bucket() + self._reduce_grads_stored_in_bucket(reduce_rank=None, last_bucket=True) + + # compute norm for gradients in the before bucket + groups_norms = [] + for group_id in range(self.num_param_groups): + groups_norms.append(self._compute_norm_with_stage(group_id=group_id)) # clear reduced grads if self._overlap_communication: - torch.cuda.synchronize() + # grads in the last bucket is reduced + self._comm_stream.synchronize() self._param_store.clear_grads_of_previous_reduced_params() + # compute norm for gradients in the last bucket + total_norms = [] + for group_id in range(self.num_param_groups): + total_norms.append( + self._compute_norm_with_stage( + group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id] + ) + ) + + timer("sync_grad").start() self._sync_grad() timer("sync_grad").stop() - return self._step(closure=closure) + return self._step(closure=closure, norms=total_norms) - def _step(self, closure=None): + def _step(self, closure=None, norms=None): assert closure is None, "closure is not supported by step()" # check for overflow - found_inf = self._check_overflow() + found_inf = False + # if there is INF values in grades, compute_norm func would also returns -1 + # thus, we try to avoid call _check_overflow here + # found_inf = self._check_overflow() # Because you may encounter inf when computing norm - timer("cal_norm").start() - norm_groups = [] - for group_id in range(self.num_param_groups): - # compute norm - if self._zero_local_rank not in self.param_group_no_params_ranks[group_id]: - gradients = self._grad_store.get_averaged_gradients_by_group(group_id) - parameters = self._param_store.get_fp16_params_by_rank_group( - group_id=group_id, rank=self._zero_local_rank - ) - else: - # in order to prevent collection communication from hanging, - # we need to involve rank that are not assigned parameters in compute_norm(), - # so we give them a fp16 vector of 0 values. - gradients = [self.padding_grad] - parameters = [self.padding_tensor] - if self._clip_grad_norm > 0: - # this norm is before scaling, it will be very large - norm_group = compute_norm( - gradients=gradients, - parameters=parameters, - ) - if norm_group == -1: - timer("cal_norm").stop() - found_inf = True - break - norm_groups.append(norm_group) + if -1 in norms: + found_inf = True loss_scale = float(self.loss_scale.item()) # backup - if not gpc.config.model.dtype is torch.float32: + if gpc.config.model.dtype is not torch.float32: self.grad_scaler.update(found_inf) # update loss scale if overflow occurs if found_inf: @@ -550,7 +567,6 @@ class HybridZeroOptimizer(BaseOptimizer): # copy the grad of fp16 param to fp32 param single_grad_partition_groups = [] - global_norm = 0 for group_id in range(self.num_param_groups): # compute norm # The following operations are performed only on the rank to which parameters are assigned. @@ -559,13 +575,14 @@ class HybridZeroOptimizer(BaseOptimizer): # create flat gradient for the flat fp32 params gradients = self._grad_store.get_averaged_gradients_by_group(group_id) - flat_fp16_avg_grads = flatten(gradients) + with torch.no_grad(): + flat_fp16_avg_grads = flatten(gradients) self._grad_store.reset_average_gradients_by_group(group_id) - del gradients # release cuda memory + gradients = None # release cuda memory dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype) - del flat_fp16_avg_grads # release cuda memory + flat_fp16_avg_grads = None # release cuda memory param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape assert ( @@ -578,15 +595,16 @@ class HybridZeroOptimizer(BaseOptimizer): # unscale and clip grads # get the global norm + global_norm_groups = [] if self._clip_grad_norm > 0: - global_norm = sum(norm_groups) ** 0.5 + for norm in norms: + global_norm_groups.append(norm**0.5) # the following operations are performed only on the rank to which parameters are assigned. - if not gpc.config.model.dtype is torch.float32: + if gpc.config.model.dtype is not torch.float32: if len(single_grad_partition_groups) != 0: - self._unscale_and_clip_grads(single_grad_partition_groups, global_norm, loss_scale) + self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale) - timer("cal_norm").stop() # update the parameters timer("step").start() @@ -611,7 +629,7 @@ class HybridZeroOptimizer(BaseOptimizer): timer("step").stop() # update gradients may not be needed here, because the sync_params function is used in initialization, # so synchronization is maintained - return True, global_norm / loss_scale + return True, [global_norm / loss_scale for global_norm in global_norm_groups] def broadcast_params(self, overlap=False): handles = [] @@ -655,18 +673,20 @@ class HybridZeroOptimizer(BaseOptimizer): return self._found_overflow.item() > 0 - def _unscale_and_clip_grads(self, grad_groups_flat, total_norm, loss_scale): + def _unscale_and_clip_grads(self, grad_groups_flat, total_norm_groups, loss_scale): # compute combined scale factor for this group - combined_scale = loss_scale + combined_scale_groups = [] if self._clip_grad_norm > 0.0: # norm is in fact norm*scale - clip = ((total_norm / loss_scale) + 1e-6) / self._clip_grad_norm - if clip > 1.0: - combined_scale = clip * loss_scale + for group_id, total_norm in enumerate(total_norm_groups): + combined_scale_groups.append(loss_scale) + clip = ((total_norm / loss_scale) + 1e-6) / self._clip_grad_norm + if clip > 1.0: + combined_scale_groups[group_id] = clip * loss_scale - for grad in grad_groups_flat: - grad.data.mul_(1.0 / combined_scale) + for group_id, grad in enumerate(grad_groups_flat): + grad.data.mul_(1.0 / combined_scale_groups[group_id]) def clip_grad_norm(self, model, max_norm): # will conduct in the step() diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py index 2ef2e4f..2f475cc 100644 --- a/internlm/solver/optimizer/store.py +++ b/internlm/solver/optimizer/store.py @@ -152,6 +152,11 @@ class ParameterStore(BaseStore): self._is_param_reduced = dict() self._reduced_param = [] + self._former_bucket_reduced_param = {} + self._last_bucket_reduced_param = {} + self._former_bucket_reduced_grad = {} + self._last_bucket_reduced_grad = {} + def set_param_to_rank(self, tensor: Tensor, rank: int) -> None: """ Set the mapping between parameter to rank, each parameter should be owned by a rank. @@ -223,6 +228,35 @@ class ParameterStore(BaseStore): def add_previous_reduced_param(self, tensor): self._reduced_param.append(tensor) + def add_reduced_param_for_compute_norm(self, param, last_bucket=False): + group_id = getattr(param, "group_id") + if last_bucket: + if group_id not in self._last_bucket_reduced_param: + self._last_bucket_reduced_param[group_id] = [] + self._last_bucket_reduced_grad[group_id] = [] + + self._last_bucket_reduced_param[group_id].append(param) + self._last_bucket_reduced_grad[group_id].append(param.grad) + else: + if group_id not in self._former_bucket_reduced_param: + self._former_bucket_reduced_param[group_id] = [] + self._former_bucket_reduced_grad[group_id] = [] + + self._former_bucket_reduced_param[group_id].append(param) + self._former_bucket_reduced_grad[group_id].append(param.grad) + + def get_reduced_param_for_compute_norm(self, group_id=0, last_bucket=False): + if not last_bucket: + return self._former_bucket_reduced_param[group_id], self._former_bucket_reduced_grad[group_id] + else: + return self._last_bucket_reduced_param[group_id], self._last_bucket_reduced_grad[group_id] + + def reset_reduced_data_for_compute_norm(self): + self._former_bucket_reduced_param = {} + self._last_bucket_reduced_param = {} + self._former_bucket_reduced_grad = {} + self._last_bucket_reduced_grad = {} + def clear_grads_of_previous_reduced_params(self): if len(self._reduced_param) > 0: for param in self._reduced_param: diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 184441b..5174cb5 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -21,6 +21,7 @@ logger = get_logger(__file__) try: import amp_C from apex.multi_tensor_apply import multi_tensor_applier + APEX_AVAILABLE = True except (ModuleNotFoundError, ImportError): logger.warn("The torch implementation for cal_l2norm is slower than apex. Please note this!") @@ -162,6 +163,7 @@ def sync_param(flat_tensor, tensor_list): for p, q in zip(tensor_list, updated_params): p.data = q.data + def multi_tensor_l2norm_torch(tensor_list, per_tensor): # Convert tensor_list elements to torch.float32 tensor_list = [tensor.float() for tensor in tensor_list] @@ -175,6 +177,7 @@ def multi_tensor_l2norm_torch(tensor_list, per_tensor): return l2_norm, per_tensor_norm + def calc_l2_norm(grads): norm = 0.0 if len(grads) > 0: @@ -187,6 +190,7 @@ def calc_l2_norm(grads): norm, _ = multi_tensor_l2norm_torch(grads, False) return norm + def calc_lp(grads, norm_type): norm = 0.0 for grad in grads: @@ -195,7 +199,7 @@ def calc_lp(grads, norm_type): return norm -def compute_norm(gradients, parameters, norm_type=2): +def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2): """Get the norm Arguments: gradients (Iterable[Tensor]): The gradient value. @@ -215,6 +219,13 @@ def compute_norm(gradients, parameters, norm_type=2): if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) total_norm_cuda = torch.FloatTensor([float(total_norm)], device=gradients[0].device) + + if last_stage is False: + return total_norm_cuda + + if previous_norm is not None: + total_norm_cuda = max(total_norm_cuda, previous_norm) + # Take max across all model-parallel GPUs. if gpc.get_world_size(ParallelMode.MODEL) > 1: dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL)) @@ -261,6 +272,12 @@ def compute_norm(gradients, parameters, norm_type=2): total_norm = tensor_parallel_norm + if last_stage is False: + return total_norm + + if previous_norm is not None: + total_norm = total_norm + previous_norm + # Sum across all model-parallel GPUs. if gpc.is_initialized(ParallelMode.MODEL): dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL)) diff --git a/train.py b/train.py index 7e87d05..6f56884 100644 --- a/train.py +++ b/train.py @@ -7,6 +7,7 @@ import traceback from functools import partial from typing import Iterable +import numpy as np import torch import torch.distributed as dist from torch import nn @@ -603,12 +604,12 @@ def main(args): trainer_result = trainer.step() assert trainer_result is not None - success_update, grad_norm = trainer_result + success_update, grad_norm_groups = trainer_result if success_update: # update parameters successfully train_state.step_count += 1 else: train_state.inf_nan_skip_batches += 1 # record the amount of updating parameters unsuccessfully. - if grad_norm == -99.0 and gpc.is_rank_for_log(): # -99.0 encodes a specific failure case + if -99.0 in grad_norm_groups and gpc.is_rank_for_log(): # -99.0 encodes a specific failure case logger.warning(f"Warning: skip parameter update at step {batch_count}.") send_alert_message( address=gpc.config.alert_address, message=f"Warning: skip parameter update at step {batch_count}." @@ -628,7 +629,7 @@ def main(args): trainer=trainer, start_time=start_time, loss=loss, - grad_norm=grad_norm, + grad_norm=np.array(grad_norm_groups), metric=metric, update_panel=uniscale_logger is not None, ) @@ -668,7 +669,6 @@ if __name__ == "__main__": main(args) except Exception: logger.error( - f"Raise exception from {hostname} with rank id: {gpc.get_global_rank()}", - exc_info=traceback.format_exc(), + f"Raise exception from {hostname} with rank id: {gpc.get_global_rank()}\n{traceback.format_exc()}", ) mm.monitor_exception(alert_address=gpc.config.alert_address, excp_info=traceback.format_exc()) From db13bc46bcb0affee798c4574276e2990f7a08ef Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 15 Aug 2023 20:09:54 +0800 Subject: [PATCH 2/8] fix(ci): fix ci train error (#199) --- internlm/core/communication/p2p.py | 2 ++ internlm/core/communication/utils.py | 1 - internlm/core/scheduler/pipeline_scheduler.py | 2 +- internlm/initialize/launch.py | 2 +- internlm/solver/optimizer/store.py | 4 ++++ 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/internlm/core/communication/p2p.py b/internlm/core/communication/p2p.py index 1a31e36..e707661 100644 --- a/internlm/core/communication/p2p.py +++ b/internlm/core/communication/p2p.py @@ -547,6 +547,8 @@ def send_backward_and_recv_next_backward_async( class AsynCommunicator: + """AsynCommunicator for managing async communication.""" + def __init__( self, tensor_to_send: Union[torch.Tensor, List[torch.Tensor]], diff --git a/internlm/core/communication/utils.py b/internlm/core/communication/utils.py index f3a808c..f413286 100644 --- a/internlm/core/communication/utils.py +++ b/internlm/core/communication/utils.py @@ -1,6 +1,5 @@ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/communication -from functools import wraps from typing import List, Tuple, Union import torch diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py index ac13073..ebdb374 100644 --- a/internlm/core/scheduler/pipeline_scheduler.py +++ b/internlm/core/scheduler/pipeline_scheduler.py @@ -139,7 +139,7 @@ class PipelineScheduler(BaseScheduler): and gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1 ) - + if gpc.config.model.sequence_parallel: self.scatter_gather_tensors = False diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index d3ea708..9955a96 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -139,7 +139,7 @@ def args_sanity_check(): gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/") if "snapshot_ckpt_folder" not in gpc.config.ckpt: - gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder), "snapshot") + gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot")) if "oss_snapshot_freq" not in gpc.config.ckpt and gpc.config.ckpt.checkpoint_every != float("inf"): gpc.config.ckpt._add_item("oss_snapshot_freq", gpc.config.ckpt.checkpoint_every / 2) diff --git a/internlm/solver/optimizer/store.py b/internlm/solver/optimizer/store.py index 2f475cc..05a44d2 100644 --- a/internlm/solver/optimizer/store.py +++ b/internlm/solver/optimizer/store.py @@ -247,8 +247,12 @@ class ParameterStore(BaseStore): def get_reduced_param_for_compute_norm(self, group_id=0, last_bucket=False): if not last_bucket: + if group_id not in self._former_bucket_reduced_param: + return [], [] return self._former_bucket_reduced_param[group_id], self._former_bucket_reduced_grad[group_id] else: + if group_id not in self._last_bucket_reduced_param: + return [], [] return self._last_bucket_reduced_param[group_id], self._last_bucket_reduced_grad[group_id] def reset_reduced_data_for_compute_norm(self): From 5f2381af623288d70f7d760efebba822e7616af0 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 16 Aug 2023 11:11:27 +0800 Subject: [PATCH 3/8] fix/ci train error (#200) * fix(ci): fix ci train error * fix(ci): fix ci train error * fix(ci): fix ci train error --- ci_scripts/train/ci_7B_sft.py | 1 + internlm/initialize/launch.py | 31 ++++++++++++++++--------------- internlm/utils/storage_manager.py | 7 +++---- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/ci_scripts/train/ci_7B_sft.py b/ci_scripts/train/ci_7B_sft.py index bc881c0..4f8477f 100644 --- a/ci_scripts/train/ci_7B_sft.py +++ b/ci_scripts/train/ci_7B_sft.py @@ -15,6 +15,7 @@ MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" SAVE_CKPT_FOLDER = "local:llm_ckpts" # LOAD_CKPT_FOLDER = "local:llm_ckpts/49" ckpt = dict( + enable_save_ckpt=True, # Path to save training ckpt. save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to continue training ckpt (load model weights and scheduler/context states). diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 9955a96..a2bc833 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -108,6 +108,9 @@ def args_sanity_check(): logger.info(f"valid_every: {data.valid_every}") # processing the checkpoint config + if "enable_save_ckpt" not in gpc.config.ckpt: + gpc.config.ckpt._add_item("enable_save_ckpt", False) + if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0: gpc.config.ckpt._add_item("checkpoint_every", float("inf")) @@ -125,18 +128,16 @@ def args_sanity_check(): if "async_upload" not in gpc.config.ckpt: gpc.config.ckpt._add_item("async_upload", False) - else: - if gpc.config.ckpt.async_upload: - assert "save_ckpt_folder" in gpc.config.ckpt - if "boto3:" not in gpc.config.ckpt.save_ckpt_folder: - if gpc.is_rank_for_log(): - logger.warning( - "Storing ckpt on file system does not support asynchronous storage, will use sync save!" - ) - gpc.config.ckpt.async_upload = False - else: - if "async_upload_tmp_folder" not in gpc.config.ckpt: - gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/") + + if "async_upload_tmp_folder" not in gpc.config.ckpt: + gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/") + + if gpc.config.ckpt.async_upload: + assert "save_ckpt_folder" in gpc.config.ckpt + if "boto3:" not in gpc.config.ckpt.save_ckpt_folder: + if gpc.is_rank_for_log(): + logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!") + gpc.config.ckpt.async_upload = False if "snapshot_ckpt_folder" not in gpc.config.ckpt: gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot")) @@ -149,14 +150,14 @@ def args_sanity_check(): gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None ), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time." - if "enable_save_ckpt" not in gpc.config.ckpt: - gpc.config.ckpt._add_item("enable_save_ckpt", False) - if gpc.is_rank_for_log(): logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201 logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}") logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}") logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}") + logger.info(f"async_upload: {gpc.config.ckpt.async_upload}") + if gpc.config.ckpt.async_upload: + logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}") # initialization storage manager init_storage_manager(gpc.config.ckpt) diff --git a/internlm/utils/storage_manager.py b/internlm/utils/storage_manager.py index 481bd28..c9b42ea 100644 --- a/internlm/utils/storage_manager.py +++ b/internlm/utils/storage_manager.py @@ -383,12 +383,12 @@ class StorageManager(metaclass=SingletonMeta): } CLI_DICT = {} - def __init__(self, enable_save, tmp_local_folde="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None: + def __init__(self, enable_save, tmp_local_folder="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None: self._exception_list = [] self._to_be_del_files = [] self._async_stack = [] self.upload_count = 0 - self.tmp_local_folder = tmp_local_folde + self.tmp_local_folder = tmp_local_folder self.async_mode = async_mode self.has_warning = False @@ -523,7 +523,6 @@ class StorageManager(metaclass=SingletonMeta): pass async def _sync_tasks(self) -> Awaitable[None]: - if not self._async_stack: return @@ -591,7 +590,7 @@ def init_storage_manager(ckpt_config): global storage_manager storage_manager = StorageManager( ckpt_config.enable_save_ckpt, - tmp_local_folde=ckpt_config.async_upload_tmp_folder, + tmp_local_folder=ckpt_config.async_upload_tmp_folder, async_mode=ckpt_config.async_upload, ) From f3664bfbabc5e080e0ccd48b84b53353e8ee0d1a Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 16 Aug 2023 15:47:05 +0800 Subject: [PATCH 4/8] fix(train.py): fix scheduler metric hook skip error (#204) --- internlm/core/scheduler/base_scheduler.py | 14 ++------------ train.py | 7 ++++++- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/internlm/core/scheduler/base_scheduler.py b/internlm/core/scheduler/base_scheduler.py index 5d3b657..20b4460 100644 --- a/internlm/core/scheduler/base_scheduler.py +++ b/internlm/core/scheduler/base_scheduler.py @@ -158,13 +158,6 @@ class SchedulerMetricHook(SchedulerHook): self._post_func = metric self._skip = skip - if skip: - # init timer only. - timer("fwd") - timer("bwd") - timer("cal_loss") - timer("post_fn") - def before_forward(self, scheduler, inputs) -> None: if not self._skip: timer("fwd").start() @@ -190,8 +183,5 @@ class SchedulerMetricHook(SchedulerHook): timer("bwd").stop() def post_helper_func(self, scheduler, outputs, label) -> None: - if not self._skip: - timer("post_fn").start() - if self._post_func is not None: - self._post_func(outputs, label) - timer("post_fn").stop() + if self._post_func is not None: + self._post_func(outputs, label) diff --git a/train.py b/train.py index 6f56884..d42cb21 100644 --- a/train.py +++ b/train.py @@ -543,7 +543,12 @@ def main(args): scheduler_hooks = [ SchedulerMetricHook( metric=metric, - skip=gpc.is_using_pp() and gpc.config.parallel["pipeline"].get("interleaved_overlap", False), + skip=( + gpc.is_using_pp() + and hasattr(gpc.config.model, "num_chunks") + and gpc.config.model.num_chunks > 1 + and gpc.config.parallel["pipeline"].get("interleaved_overlap", False) + ), ), ] From f5f54465601a9d2d55b960f2a59f16cc88bb9fd5 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Wed, 16 Aug 2023 15:57:26 +0800 Subject: [PATCH 5/8] Merge main to develop (#203) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix/fix_submodule_err (#61) * fix/fix_submodule_err --------- Co-authored-by: ChenQiaoling00 * fix issue templates (#65) * fix(tokenizer): refactor tokenizer and update usage in readme (#51) * update tokenizer example * fix(readme, requirements): fix typo at Chinese readme and select a lower version of transformers (#73) * fix a typo in readme * in order to find InternLMTokenizer, select a lower version of Transformers --------- Co-authored-by: gouhchangjiang * [Doc] Add wechat and discord link in readme (#78) * Doc:add wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * Doc:update wechat and discord link * [Docs]: add Japanese README (#43) * Add Japanese README * Update README-ja-JP.md replace message * Update README-ja-JP.md * add repetition_penalty in GenerationConfig in web_demo.py (#48) Co-authored-by: YWMditto <862779238@qq.com> * use fp16 in instruction (#80) * [Enchancement] add more options for issue template (#77) * [Enchancement] add more options for issue template * update qustion icon * fix link * Use tempfile for convert2hf.py (#23) Fix https://github.com/InternLM/InternLM/issues/50 * delete torch_dtype of README's example code (#100) * set the value of repetition_penalty to 1.0 to avoid random outputs (#99) * Update web_demo.py (#97) Remove meaningless log. * [Fix]Fix wrong string cutoff in the script for sft text tokenizing (#106) * docs(install.md): update dependency package transformers version to >= 4.28.0 (#124) Co-authored-by: 黄婷 * docs(LICENSE): add license (#125) * add license of colossalai and flash-attn * fix lint * modify the name * fix AutoModel map in convert2hf.py (#116) * variables are not printly as expect (#114) * feat(solver): fix code to adapt to torch2.0 and provide docker images (#128) * feat(solver): fix code to adapt to torch2.0 * docs(install.md): publish internlm environment image * docs(install.md): update dependency packages version * docs(install.md): update default image --------- Co-authored-by: 黄婷 * add demo test (#132) Co-authored-by: qa-caif-cicd * fix web_demo cache accelerate (#133) * Doc: add twitter link (#141) * Feat add checkpoint fraction (#151) * feat(config): add checkpoint_fraction into config * feat: remove checkpoint_fraction from configs/7B_sft.py --------- Co-authored-by: wangguoteng.p * [Doc] update deployment guide to keep consistency with lmdeploy (#136) * update deployment guide * fix error * use llm partition (#159) Co-authored-by: qa-caif-cicd * test(ci_scripts): clean test data after test, remove unnecessary global variables, and other optimizations (#165) * test: optimization of ci scripts(variables, test data cleaning, etc). * chore(workflows): disable ci job on push. * fix: update partition * test(ci_scripts): add install requirements automaticlly,trigger event about lint check and other optimizations (#174) * add pull_request in lint check * use default variables in ci_scripts * fix format * check and install requirements automaticlly * fix format --------- Co-authored-by: qa-caif-cicd * feat(profiling): add a simple memory profiler (#89) * feat(profiling): add simple memory profiler * feat(profiling): add profiling argument * feat(CI_workflow): Add PR & Issue auto remove workflow (#184) * feat(ci_workflow): Add PR & Issue auto remove workflow Add a workflow for stale PR & Issue auto remove - pr & issue well be labeled as stale for inactive in 7 days - staled PR & Issue well be remove in 7 days - run this workflow every day on 1:30 a.m. * Update stale.yml * feat(bot): Create .owners.yml for Auto Assign (#176) * Create .owners.yml: for issue/pr assign automatically * Update .owners.yml * Update .owners.yml fix typo * [feat]: add pal reasoning script (#163) * [Feat] Add PAL inference script * Update README.md * Update tools/README.md Co-authored-by: BigDong * Update tools/pal_inference.py Co-authored-by: BigDong * Update pal script * Update README.md * restore .ore-commit-config.yaml * Update tools/README.md Co-authored-by: BigDong * Update tools/README.md Co-authored-by: BigDong * Update pal inference script * Update READMD.md * Update internlm/utils/interface.py Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> * Update pal script * Update pal script * Update script * Add docstring * Update format * Update script * Update script * Update script --------- Co-authored-by: BigDong Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> * test(ci_scripts): add timeout settings and clean work after the slurm job (#185) * restore pr test on develop branch * add mask * add post action to cancel slurm job * remove readonly attribute on job log * add debug info * debug job log * try stdin * use stdin * set default value avoid error * try setting readonly on job log * performance echo * remove debug info * use squeue to check slurm job status * restore the lossed parm * litmit retry times * use exclusive to avoid port already in use * optimize loop body * remove partition * add {} for variables * set env variable for slurm partition --------- Co-authored-by: qa-caif-cicd * refactor(tools): move interface.py and import it to web_demo (#195) * move interface.py and import it to web_demo * typo * fix(ci): fix lint error * fix(ci): fix lint error --------- Co-authored-by: Sun Peng Co-authored-by: ChenQiaoling00 Co-authored-by: Kai Chen Co-authored-by: Yang Gao Co-authored-by: Changjiang GOU Co-authored-by: gouhchangjiang Co-authored-by: vansin Co-authored-by: Ikko Eltociear Ashimine Co-authored-by: YWMditto <46778265+YWMditto@users.noreply.github.com> Co-authored-by: YWMditto <862779238@qq.com> Co-authored-by: WRH <12756472+wangruohui@users.noreply.github.com> Co-authored-by: liukuikun <24622904+Harold-lkk@users.noreply.github.com> Co-authored-by: x54-729 <45304952+x54-729@users.noreply.github.com> Co-authored-by: Shuo Zhang Co-authored-by: Miao Zheng <76149310+MeowZheng@users.noreply.github.com> Co-authored-by: 黄婷 Co-authored-by: ytxiong <45058324+yingtongxiong@users.noreply.github.com> Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Co-authored-by: kkscilife <126147887+kkscilife@users.noreply.github.com> Co-authored-by: qa-caif-cicd Co-authored-by: hw <45089338+MorningForest@users.noreply.github.com> Co-authored-by: Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Co-authored-by: wangguoteng.p Co-authored-by: lvhan028 Co-authored-by: zachtzy <141206206+zachtzy@users.noreply.github.com> Co-authored-by: cx <759046501@qq.com> Co-authored-by: Jaylin Lee <61487970+APX103@users.noreply.github.com> Co-authored-by: del-zhenwu Co-authored-by: Shaoyuan Xie <66255889+Daniel-xsy@users.noreply.github.com> Co-authored-by: BigDong Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> Co-authored-by: huangting4201 --- .github/workflows/demo_in_readme.yaml | 61 +- .github/workflows/lint_check.yaml | 12 +- .github/workflows/stale.yml | 32 + .gitignore | 3 +- .owners.yml | 14 + README-zh-Hans.md | 23 +- README.md | 27 +- ci_scripts/common/basic_func.sh | 26 +- ci_scripts/common/post_action.sh | 21 + ci_scripts/common/variables.sh | 3 + ci_scripts/data/tokenizer_alpaca.sh | 54 +- ci_scripts/data/tokenizer_chinese.sh | 45 +- ci_scripts/model/convert_to_hf.sh | 56 +- ci_scripts/model/demo_load_7B_chat_model.py | 3 +- ci_scripts/model/loaded_as_transformer.py | 2 + ci_scripts/train/slurm_train.sh | 39 +- ci_scripts/train/torchrun.sh | 40 +- configs/7B_sft.py | 2 +- doc/en/usage.md | 2 +- doc/usage.md | 2 +- internlm/initialize/launch.py | 15 +- internlm/model/modeling_internlm.py | 23 +- internlm/model/utils.py | 2 +- .../solver/optimizer/hybrid_zero_optim.py | 2 + internlm/solver/optimizer/utils.py | 2 +- internlm/utils/simple_memory_profiler.py | 670 ++++++++++++++++++ internlm/utils/timeout.py | 26 + requirements/runtime.txt | 1 + tools/README.md | 60 +- tools/README_EN.md | 61 +- tools/pal_inference.py | 320 +++++++++ tools/transformers/interface.py | 137 ++++ train.py | 20 + web_demo.py | 189 +---- 34 files changed, 1691 insertions(+), 304 deletions(-) create mode 100644 .github/workflows/stale.yml create mode 100644 .owners.yml create mode 100644 ci_scripts/common/post_action.sh create mode 100644 ci_scripts/common/variables.sh create mode 100644 internlm/utils/simple_memory_profiler.py create mode 100644 internlm/utils/timeout.py create mode 100644 tools/pal_inference.py create mode 100644 tools/transformers/interface.py diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml index 7dd59c3..ce7a6c3 100644 --- a/.github/workflows/demo_in_readme.yaml +++ b/.github/workflows/demo_in_readme.yaml @@ -7,44 +7,84 @@ on: paths-ignore: - "docs/**" - "**.md" - +env: + WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4) + SLURM_PARTITION: llm + jobs: - dataset-preparation: + check-requirements: runs-on: [lmtest] steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" + - uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: check-requirements + run: | + source activate internlm-env-test + changed_files=$(git diff --name-only -r HEAD^1 HEAD) + echo $changed_files + if [[ $changed_files =~ "runtime.txt" ]]; then + pip install -r requirements/runtime.txt + fi + + if [[ $changed_files =~ "torch.txt" ]]; then + pip install -r requirements/torch.txt + fi + + dataset-preparation: + if: ${{ always() }} + needs: check-requirements + runs-on: [lmtest] + steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" - uses: actions/checkout@v3 - name: raw-chinese-data run: | source activate internlm-env-test - sh ./ci_scripts/data/tokenizer_chinese.sh + sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB} - name: alpaca-data run: | source activate internlm-env-test sh ./ci_scripts/data/tokenizer_alpaca.sh - train: + if: ${{ always() }} + needs: check-requirements runs-on: [lmtest] + timeout-minutes: 30 steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" - uses: actions/checkout@v3 - name: slurm-train run: | source activate internlm-env-test - sh ./ci_scripts/train/slurm_train.sh + sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB} rm -rf $GITHUB_WORKSPACE/llm_ckpts - name: torchrun-train run: | source activate internlm-env-test - sh ./ci_scripts/train/torchrun.sh + sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB} rm -rf $GITHUB_WORKSPACE/llm_ckpts convert-model-then-load: + if: ${{ always() }} + needs: check-requirements runs-on: [lmtest] steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" - uses: actions/checkout@v3 - name: convert-model-then-load @@ -53,16 +93,21 @@ jobs: export PYTHONPATH=$PWD:$PYTHONPATH sh ./ci_scripts/model/convert_to_hf.sh cd ./hf_ckpt - srun -p llm2 python ../ci_scripts/model/loaded_as_transformer.py + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py cd .. rm -rf $GITHUB_WORKSPACE/hf_ckpt load-chat-model-in-hf: + if: ${{ always() }} + needs: check-requirements runs-on: [lmtest] steps: + - name: mask env + run: | + echo "::add-mask::${{env.WORKSPACE_PREFIX}}" - uses: actions/checkout@v3 - name: chat-model-in-hf run: | source activate internlm-env-test - srun -p llm2 python ./ci_scripts/model/demo_load_7B_chat_model.py + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py diff --git a/.github/workflows/lint_check.yaml b/.github/workflows/lint_check.yaml index f32d68f..f2f66ce 100644 --- a/.github/workflows/lint_check.yaml +++ b/.github/workflows/lint_check.yaml @@ -1,12 +1,16 @@ name: lint-check -on: [push] +on: + push: + pull_request: + branches: + - "main" + - "develop" jobs: # lint check can be auto-executed by the workflow lint-check: - runs-on: [internlm] - if: github.repository == 'InternLM/InternLM' + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -35,4 +39,4 @@ jobs: pip install pylint==v2.17.2 PYLINT_DISABLE_LIST="C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203" pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./internlm/* - pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py \ No newline at end of file + pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000..1c0cba4 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,32 @@ +name: 'Close stale issues and PRs' + +on: + schedule: + # check issue and pull request once at 01:30 a.m. every day + - cron: '30 1 * * *' + +permissions: + contents: read + +jobs: + stale: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v7 + with: + stale-issue-message: 'This issue is marked as stale because it has been marked as invalid or awaiting response for 7 days without any further response. It will be closed in 7 days if the stale label is not removed or if there is no further response.' + stale-pr-message: 'This PR is marked as stale because there has been no activity in the past 7 days. It will be closed in 7 days if the stale label is not removed or if there is no further updates.' + close-issue-message: 'This issue is closed because it has been stale for 7 days. Please open a new issue if you have similar issues or you have any new updates now.' + close-pr-message: 'This PR is closed because it has been stale for 7 days. Please reopen this PR if you have any updates and want to keep contributing the code.' + # Labels on issues exempted from stale + exempt-issue-labels: "enhancement,planned" + days-before-issue-stale: 7 + days-before-pr-stale: 7 + days-before-issue-close: 7 + days-before-pr-close: 7 + # automatically remove the stale label when the issues or the pull requests are updated or commented + remove-stale-when-updated: true + operations-per-run: 50 diff --git a/.gitignore b/.gitignore index ca9ea1a..8992a0f 100644 --- a/.gitignore +++ b/.gitignore @@ -143,4 +143,5 @@ core.* # Run llm_ckpts -events.* \ No newline at end of file +events.* +memory_trace diff --git a/.owners.yml b/.owners.yml new file mode 100644 index 0000000..1ae4c19 --- /dev/null +++ b/.owners.yml @@ -0,0 +1,14 @@ +assign: + strategy: + # random + daily-shift-based + schedule: + '*/1 * * * *' + assignees: + - yhcc + - yhcc + - sunpengsdu + - sunpengsdu + - ZwwWayne + - ZwwWayne + - yhcc diff --git a/README-zh-Hans.md b/README-zh-Hans.md index 2281d7d..ad19550 100644 --- a/README-zh-Hans.md +++ b/README-zh-Hans.md @@ -32,7 +32,7 @@

- 👋 加入我们的 Discord微信社区 + 👋 加入我们的推特Discord微信社区

## 简介 @@ -119,21 +119,22 @@ streamlit run web_demo.py 1. 首先安装 LMDeploy: - ``` - python3 -m pip install lmdeploy - ``` + ```bash + python3 -m pip install lmdeploy + ``` 2. 快速的部署命令如下: - ``` - python3 -m lmdeploy.serve.turbomind.deploy InternLM-7B /path/to/internlm-7b/model hf - ``` + ```bash + python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-7b/model + ``` -3. 在导出模型后,你可以直接通过如下命令启动服务一个服务并和部署后的模型对话 +3. 在导出模型后,你可以直接通过如下命令启动服务,并在客户端与AI对话 - ``` - python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 - ``` + ```bash + bash workspace/service_docker_up.sh + python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 + ``` [LMDeploy](https://github.com/InternLM/LMDeploy) 支持了 InternLM 部署的完整流程,请参考 [部署教程](https://github.com/InternLM/LMDeploy) 了解 InternLM 的更多部署细节。 diff --git a/README.md b/README.md index c66a0ae..8a0277b 100644 --- a/README.md +++ b/README.md @@ -32,13 +32,9 @@

- 👋 join us on Discord and WeChat + 👋 join us on Twitter, Discord and WeChat

- - - - ## Introduction InternLM has open-sourced a 7 billion parameter base model and a chat model tailored for practical scenarios. The model has the following characteristics: @@ -126,21 +122,22 @@ We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-clic 1. First, install LMDeploy: -``` - python3 -m pip install lmdeploy -``` + ```bash + python3 -m pip install lmdeploy + ``` 2. Use the following command for quick deployment: -``` - python3 -m lmdeploy.serve.turbomind.deploy InternLM-7B /path/to/internlm-7b/model hf -``` + ```bash + python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b/model + ``` 3. After exporting the model, you can start a server and have a conversation with the deployed model using the following command: - -``` - python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 -``` + + ```bash + bash workspace/service_docker_up.sh + python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 + ``` [LMDeploy](https://github.com/InternLM/LMDeploy) provides a complete workflow for deploying InternLM. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM. diff --git a/ci_scripts/common/basic_func.sh b/ci_scripts/common/basic_func.sh index f9bb196..8ce1c54 100644 --- a/ci_scripts/common/basic_func.sh +++ b/ci_scripts/common/basic_func.sh @@ -1,14 +1,18 @@ #!/bin/bash -export exit_code=0 - -function if_exist() { -ls -l $file_path -exit_code_now=$? -exit_code=$(($exit_code + $exit_code_now)) -} - -function num_files() { -file_num=$(ls -l $file_dir |wc -l) -echo "there are $file_num files in $file_dir" +####################################### +# Calculate the number of files in a directory. +# Call this function like this: num_files "${file_path}". +# Globals: +# None +# Arguments: +# $1: the directory path +# Returns: +# the number of files in the directory +####################################### +num_files() { + [[ $# -eq 1 ]] || return 1 + local file_num + file_num=$(ls -l $1 | grep '^-' | wc -l) + echo $file_num } diff --git a/ci_scripts/common/post_action.sh b/ci_scripts/common/post_action.sh new file mode 100644 index 0000000..9aa4d22 --- /dev/null +++ b/ci_scripts/common/post_action.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -x + +retry_times=3 +for ((i=1;i<=$retry_times;i++));do + jobid=$(squeue -o "%A %j" -u $USER | grep ${GITHUB_RUN_ID}-${GITHUB_JOB} | awk '{print $1}') + if [[ -n "$jobid" ]];then + echo "The job $jobid will be canceled." + scancel $jobid + sleep 0.5 + else + echo "There are no more jobs that need to be canceled." + break + fi +done + +if [[ $i -gt $retry_times ]];then + echo "There have been tried $retry_times times. Please contact user $USER to confirm the job status." +fi + +exit 0 diff --git a/ci_scripts/common/variables.sh b/ci_scripts/common/variables.sh new file mode 100644 index 0000000..cc1b0e0 --- /dev/null +++ b/ci_scripts/common/variables.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +readonly DATA_VOLUME=$(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)/data diff --git a/ci_scripts/data/tokenizer_alpaca.sh b/ci_scripts/data/tokenizer_alpaca.sh index e8ccac7..0d06455 100644 --- a/ci_scripts/data/tokenizer_alpaca.sh +++ b/ci_scripts/data/tokenizer_alpaca.sh @@ -1,22 +1,50 @@ #!/bin/bash +set -x -rm -rf /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/* +source ./ci_scripts/common/variables.sh +[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; } -python tools/alpaca_tokenizer.py /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/alpaca_data.json /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result tools/V7_sft.model --split_ratio 0.1 +readonly SRC_DATASET_META=${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json +readonly RESULTS=${DATA_VOLUME}/lm_data/alpaca_data/result +readonly TRAIN_DATASET=${RESULTS}/train/en/dataset.bin +readonly TRAIN_DATASET_META=${RESULTS}/train/en/dataset.bin.meta +readonly VALID_DATASET=${RESULTS}/valid/en/dataset.bin +readonly VALID_DATASET_META=${RESULTS}/valid/en/dataset.bin.meta -file_one="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/train/en/dataset.bin" -file_two="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/train/en/dataset.bin.meta" -file_three="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/valid/en/dataset.bin" -file_four="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/valid/en/dataset.bin.meta" -file_list=($file_one $file_two $file_three $file_four) +split_ratio=0.1 +exit_code=0 source ./ci_scripts/common/basic_func.sh -for file_path in ${file_list[@]}; -do -if_exist $file_path + +echo "start to test alpaca_tokenizer.py." + +if [[ -d ${RESULTS} ]]; then + if ! rm -rf ${RESULTS}/*; then + echo "cleaning test data in ${RESULTS} failed, exit." + exit 1 + fi +fi + +if [[ ! -f ${SRC_DATASET_META} ]]; then + echo "${SRC_DATASET_META} should be exist, exit." + exit 1 +fi + +python tools/alpaca_tokenizer.py ${SRC_DATASET_META} ${RESULTS} tools/V7_sft.model --split_ratio ${split_ratio} +[[ $? -ne 0 ]] && { echo "test alpaca_tokenizer.py failed."; exit_code=$(($exit_code + 1)); } + +file_list=(${TRAIN_DATASET} ${TRAIN_DATASET_META} ${VALID_DATASET} ${VALID_DATASET_META}) +for file in ${file_list[@]}; do + if [[ ! -f ${file} ]]; then + echo "expect: ${file} exists, actual: not exist." + exit_code=$(($exit_code + 1)) + fi done -if [ $exit_code -ne 0 ] -then - exit 1 +# clean the test files. +if ! rm -rf ${RESULTS}/*; then + echo "cleaning test data in ${RESULTS} failed." + exit_code=$(($exit_code + 1)) fi + +exit $exit_code diff --git a/ci_scripts/data/tokenizer_chinese.sh b/ci_scripts/data/tokenizer_chinese.sh index 6904b3c..ce76530 100644 --- a/ci_scripts/data/tokenizer_chinese.sh +++ b/ci_scripts/data/tokenizer_chinese.sh @@ -1,19 +1,42 @@ #!/bin/bash +set -x -rm -rf /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.* -srun -p llm2 python tools/tokenizer.py --text_input_path /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/raw_data.txt --bin_output_path /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin +source ./ci_scripts/common/variables.sh +[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; } -file_one="/mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin" -file_two="/mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin.meta" -file_list=($file_one $file_two) +readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt +readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin +readonly RESULT_META=${DATA_VOLUME}/lm_data/cn_data/result.bin.meta +readonly RESULTS=${DATA_VOLUME}/lm_data/cn_data/result.* +exit_code=0 source ./ci_scripts/common/basic_func.sh -for file_path in ${file_list[@]}; -do -if_exist $file_path + +echo "start to test tokenizer.py." + +num=$(num_files "${RESULTS}") +if [[ ${num} -gt 0 ]]; then + if ! rm -rf ${RESULTS}; then + echo "cleaning test data ${RESULTS} failed, exit." + exit 1 + fi +fi + +srun -p ${SLURM_PARTITION} --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT} +[[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); } + +file_list=($RESULT $RESULT_META) +for file in ${file_list[@]}; do + if [[ ! -f ${file} ]]; then + echo "expect: ${file} exists, actual: not exist." + exit_code=$(($exit_code + 1)) + fi done -if [ $exit_code -ne 0 ] -then - exit 1 +# clean the test files. +if ! rm -rf ${RESULTS}/*; then + echo "cleaning cached file in ${RESULTS} failed." + exit_code=$(($exit_code + 1)) fi + +exit $exit_code diff --git a/ci_scripts/model/convert_to_hf.sh b/ci_scripts/model/convert_to_hf.sh index 385bba5..162946d 100644 --- a/ci_scripts/model/convert_to_hf.sh +++ b/ci_scripts/model/convert_to_hf.sh @@ -1,33 +1,47 @@ #!/bin/bash +set -x -rm -rf ./hf_ckpt/* -python ./tools/transformers/convert2hf.py --src_folder /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/llm_ckpts/20 --tgt_folder hf_ckpt/ --tokenizer ./tools/V7_sft.model +source ./ci_scripts/common/variables.sh +[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; } +[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } -#assert exists model -file_one="$GITHUB_WORKSPACE/hf_ckpt/tokenizer.model" -file_two="$GITHUB_WORKSPACE/hf_ckpt/config.json" -file_three="$GITHUB_WORKSPACE/hf_ckpt/modeling_internlm.py" -file_list=($file_one $file_two $file_three) -file_dir="$GITHUB_WORKSPACE/hf_ckpt/*" +readonly CKPTS_INPUT="${DATA_VOLUME}/lm_data/alpaca_data/llm_ckpts/20" +readonly CKPTS_OUTPUT="${GITHUB_WORKSPACE}/hf_ckpt" +readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model" +readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json" +readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py" +exit_code=0 +expected_num=9 source ./ci_scripts/common/basic_func.sh -for file_path in ${file_list[@]}; -do -if_exist $file_path +echo "start to test convert2hf.py." + +if [[ -d ${CKPTS_OUTPUT} ]]; then + if ! rm -rf ${CKPTS_OUTPUT}/*; then + echo "cleaning cached file in ${CKPTS_OUTPUT} failed, exit." + exit 1 + fi +fi + +python ./tools/transformers/convert2hf.py --src_folder ${CKPTS_INPUT} --tgt_folder ${CKPTS_OUTPUT} --tokenizer ./tools/V7_sft.model +[[ $? -ne 0 ]] && { echo "test convert2hf.py failed."; exit_code=$(($exit_code + 1)); } + +#assert exists model +file_list=($TOKENIZER $CONFIG $INERNLM) +for file in ${file_list[@]}; do + if [[ ! -f ${file} ]];then + echo "file ${file} does not exist." + exit_code=$(($exit_code + 1)) + fi done +num=$(num_files "${CKPTS_OUTPUT}") -num_files ${file_dir} - -if [ $file_num -ne 9 ] -then - echo "The num of files is not right" - ls -l $file_dir +if [[ ${num} -ne ${expected_num} ]]; then + echo "expect: ${expected_num} files, actual: ${num} files." exit_code=$(($exit_code + 1)) fi -if [ $exit_code -ne 0 ] -then - exit 1 -fi +# NOTICE: should not remove the cached files, because the cached files will be used in the next test case. +exit $exit_code diff --git a/ci_scripts/model/demo_load_7B_chat_model.py b/ci_scripts/model/demo_load_7B_chat_model.py index 61cec0d..695be27 100644 --- a/ci_scripts/model/demo_load_7B_chat_model.py +++ b/ci_scripts/model/demo_load_7B_chat_model.py @@ -1,4 +1,5 @@ - +#!/usr/bin/env python +# -*- encoding: utf-8 -*- from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True) diff --git a/ci_scripts/model/loaded_as_transformer.py b/ci_scripts/model/loaded_as_transformer.py index 5e3d28d..5254fb9 100644 --- a/ci_scripts/model/loaded_as_transformer.py +++ b/ci_scripts/model/loaded_as_transformer.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- from transformers import AutoModel model = AutoModel.from_pretrained("../hf_ckpt/", trust_remote_code=True).cuda() diff --git a/ci_scripts/train/slurm_train.sh b/ci_scripts/train/slurm_train.sh index 99cb0e8..0097bd9 100644 --- a/ci_scripts/train/slurm_train.sh +++ b/ci_scripts/train/slurm_train.sh @@ -1,20 +1,37 @@ #!/bin/bash +set -x -rm -rf $GITHUB_WORKSPACE/llm_ckpts/20 +[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } +readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" +readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20" +readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt" +expected_num=21 +exit_code=0 -srun -p llm2 --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py - -file_dir="$GITHUB_WORKSPACE/llm_ckpts/20/*.pt" source ./ci_scripts/common/basic_func.sh -num_files ${file_dir} +echo "start to test slurm training." -if [ $file_num -ne 21 ] -then - echo "The num of files is not right" - ls -l $file_dir - rm -rf $GITHUB_WORKSPACE/llm_ckpts - exit 1 +if [[ -d ${CKPTS20_PATH} ]]; then + if ! rm -rf ${CKPTS20_PATH}/*; then + echo "cleaning cached file in ${CKPTS20_PATH} failed, exit." + exit 1 + fi fi +srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py +[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); } +num=$(num_files "${CKPTS20_OUTPUT}") +if [[ ${num} -ne ${expected_num} ]]; then + echo "expect: ${expected_num} files, actual: ${num} files." + exit_code=$(($exit_code + 1)) +fi + +# clean the test files. +if ! rm -rf ${CKPTS_PATH}/*; then + echo "cleaning cached file in ${CKPTS_PATH} failed." + exit_code=$(($exit_code + 1)) +fi + +exit $exit_code diff --git a/ci_scripts/train/torchrun.sh b/ci_scripts/train/torchrun.sh index 06e9e55..629a3e5 100644 --- a/ci_scripts/train/torchrun.sh +++ b/ci_scripts/train/torchrun.sh @@ -1,17 +1,37 @@ #!/bin/bash +set -x -rm -rf $GITHUB_WORKSPACE/llm_ckpts/20 -srun -p llm2 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher "torch" +[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } +readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" +readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20" +readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt" +expected_num=21 +exit_code=0 -file_dir="$GITHUB_WORKSPACE/llm_ckpts/20/*.pt" source ./ci_scripts/common/basic_func.sh -num_files ${file_dir} +echo "start to test torch training." -if [ $file_num -ne 21 ] -then - echo "The num of files is not right" - ls -l $file_dir - rm -rf $GITHUB_WORKSPACE/llm_ckpts - exit 1 +if [[ -d ${CKPTS20_PATH} ]]; then + if ! rm -rf ${CKPTS20_PATH}/*; then + echo "cleaning cached file in ${CKPTS20_PATH} failed, exit." + exit 1 + fi fi + +srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch +[[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); } + +num=$(num_files "${CKPTS_OUTPUT}") +if [[ ${num} -ne ${expected_num} ]]; then + echo "expect: ${expected_num} files, actual: ${num} files." + exit_code=$(($exit_code + 1)) +fi + +# clean the test files. +if ! rm -rf ${CKPTS_PATH}/*; then + echo "cleaning cached file in ${CKPTS_PATH} failed." + exit_code=$(($exit_code + 1)) +fi + +exit $exit_code diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 32f7c5f..bf7472e 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -110,7 +110,7 @@ beta2_scheduler = dict( ) model = dict( - checkpoint=False, + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, diff --git a/doc/en/usage.md b/doc/en/usage.md index 474f0bf..0f62ebc 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -140,7 +140,7 @@ HIDDEN_SIZE = 4096 NUM_LAYER = 32 MLP_RATIO = 8 / 3 model = dict( - checkpoint=False, + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, diff --git a/doc/usage.md b/doc/usage.md index 3cc6f10..8c9a455 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -126,7 +126,7 @@ HIDDEN_SIZE = 4096 NUM_LAYER = 32 MLP_RATIO = 8 / 3 model = dict( - checkpoint=False, + checkpoint=False, # 进行重计算的模型层数比例,可选值为 True/False/[0-1] num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index a2bc833..986d1f7 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -181,9 +181,10 @@ def args_sanity_check(): logger.info(f"cudnn.deterministic: {torch.backends.cudnn.deterministic }") logger.info(f"clip_grad_norm: {clip_grad_norm}") - if "dtype" not in gpc.config.model: + model = gpc.config.model + if "dtype" not in model: logger.warning("dtype is not set, use torch.float16 by defalut!") - gpc.config.model._add_item("dtype", torch.float16) + model._add_item("dtype", torch.float16) else: if gpc.config.model.dtype == "torch.bfloat16": gpc.config.model.dtype = torch.bfloat16 @@ -206,6 +207,16 @@ def args_sanity_check(): "torch.tf32", ] + if "checkpoint" in model: + if model.checkpoint is True: + model.checkpoint = 1 + elif model.checkpoint is False: + model.checkpoint = 0 + else: + assert ( + model.checkpoint >= 0 and model.checkpoint <= 1 + ), f'model.checkpoint: "{model.checkpoint}" should >=0 and <=1' + if gpc.is_rank_for_log(): logger.info("+" * 15 + " Model Info " + "+" * 15) # pylint: disable=W1201 logger.info(f"Model: {gpc.config.model}") diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 4a7a4ee..404a078 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -232,9 +232,8 @@ class PackedFlashInternLm1D(nn.Module): attn_drop_rate (float): The dropout rate of attention module. 0.0 by default. drop_rate (float): The dropout rate of input hidden state. 0.0 by default. dtype (torch.dtype): The type of data. torch.float by default. - checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. - checkpoint_fraction (float): The proportion of layers that need to be checkpointed compared to the total number - of layers. 1.0 by default. + checkpoint (float): The proportion of layers that need to be checkpointed compared to the total number + of layers. 0.0 by default. layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-6 by default. first (bool): Whether input embedding layer or not. False by default. last (bool): Whether output embedding layer or not. False by default. @@ -260,8 +259,7 @@ class PackedFlashInternLm1D(nn.Module): attn_drop_rate: float = 0.0, drop_rate: float = 0.0, dtype: torch.dtype = torch.float, - checkpoint: bool = False, - checkpoint_fraction: float = 1.0, + checkpoint: float = 0.0, layer_norm_epsilon: float = 1e-5, first: bool = False, last: bool = False, @@ -280,12 +278,8 @@ class PackedFlashInternLm1D(nn.Module): ): super().__init__() - self.use_flash_attn = use_flash_attn - if checkpoint_fraction <= 0: - checkpoint = False - if not checkpoint: - checkpoint_fraction = 0 - checkpoint_layer_num = num_layers * checkpoint_fraction + checkpoint_layer_num = int(num_layers * checkpoint) + if is_reward: head_cls = RewardModelLinear else: @@ -414,11 +408,6 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), models = [] - if kwargs["checkpoint"] is True: - kwargs["checkpoint_fraction"] = 1.0 - else: - kwargs["checkpoint_fraction"] = 0 - for start, end in parts: kwargs["num_layers"] = end - start kwargs["first"] = start == 0 @@ -441,7 +430,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), @MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE) def build_model_with_cfg( num_chunks=1, - checkpoint=False, + checkpoint=0.0, dtype=torch.float, embed_split_hidden=False, num_layers=48, diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 18845f0..12f80e3 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -203,7 +203,7 @@ def try_import_RMSNorm(): return RMSNorm except ModuleNotFoundError: - logger.warn("The torch implementation for MixFusedRMSNorm is slower than apex. Please note this!") + logger.warning("The torch implementation for MixFusedRMSNorm is slower than apex. Please note this!") from internlm.model.norm import RMSNormTorch as RMSNorm return RMSNorm diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 655d06c..55fad5f 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import math from functools import partial import torch @@ -32,6 +33,7 @@ from internlm.utils.megatron_timers import megatron_timer as timer from .utils import compute_norm +inf = math.inf logger = get_logger(__file__) diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py index 5174cb5..5a752ef 100644 --- a/internlm/solver/optimizer/utils.py +++ b/internlm/solver/optimizer/utils.py @@ -24,7 +24,7 @@ try: APEX_AVAILABLE = True except (ModuleNotFoundError, ImportError): - logger.warn("The torch implementation for cal_l2norm is slower than apex. Please note this!") + logger.warning("The torch implementation for cal_l2norm is slower than apex. Please note this!") APEX_AVAILABLE = False inf = math.inf diff --git a/internlm/utils/simple_memory_profiler.py b/internlm/utils/simple_memory_profiler.py new file mode 100644 index 0000000..4ca6679 --- /dev/null +++ b/internlm/utils/simple_memory_profiler.py @@ -0,0 +1,670 @@ +import os +import time +from collections import OrderedDict +from functools import partial +from typing import Any, Dict, List, Tuple + +import pyecharts +import torch + +from internlm.core.context import ParallelMode +from internlm.core.context import global_context as gpc +from internlm.solver.pipeline_utils import partition_uniform + +mb = 1024 * 1024 + + +class SimpleMemState: + """ + A class to represent the memory state of a model layer. + + Args: + layer_name (str): The name of the layer. + layer_mem (int): The memory usage of the layer in bytes. + """ + + def __init__(self, layer_name: str, layer_mem: int = 0) -> None: + self.layer_name = layer_name + + # Memory status of the current model layer. + self._layer_mem: int = layer_mem + # Total memory status of the model and sub-models, initialized with layer memory. + self._total_mem: int = self._layer_mem + # SimpleMemState of sub-models. + self.sub_model_stats = OrderedDict() + + @property + def layer_mem(self) -> int: + """ + Get the memory usage of the layer. + + Returns: + int: The memory usage of the layer in bytes. + """ + return self._layer_mem + + @layer_mem.setter + def layer_mem(self, new_layer_mem: int) -> None: + """ + Set the memory usage of the layer. + + Args: + new_layer_mem (int): The new memory usage of the layer in bytes. + """ + diff = new_layer_mem - self._layer_mem + self._layer_mem = new_layer_mem + self._total_mem += diff + + @property + def total_mem(self) -> int: + """ + Get the total memory usage of the model and sub-models. + + Returns: + int: The total memory usage in bytes. + """ + return self._total_mem + + def add(self, layer_name: str, layer_mem: int = 0, flush: bool = True) -> None: + """ + Add a layer to the memory state. + + Args: + layer_name (str): The name of the layer. + layer_mem (int, optional): The memory usage of the layer in bytes. Defaults to 0. + flush (bool, optional): Whether to update the total memory usage. Defaults to True. + """ + path = layer_name.split(".") + + target = self.find_layer_state(path, create=True) + target.layer_mem = layer_mem + + if flush: + self.update_total_memory() + + def delete(self, layer_name: str, flush: bool = True) -> None: + """ + Delete a layer from the memory state. + + Args: + layer_name (str): The name of the layer. + flush (bool, optional): Whether to update the total memory usage. Defaults to True. + """ + path = layer_name.split(".") + assert len(path) >= 2, f"Only support deleting non-root layers, layer_name: {layer_name}" + + parent_path = path[0:-1] + layer = path[-1] + parent = self.find_layer_state(parent_path) + + if parent is not None and layer in parent.sub_model_stats: + del parent.sub_model_stats[layer] + + if flush: + self.update_total_memory() + + def update_total_memory(self) -> None: + """ + Update the total memory usage of the model and sub-models. + """ + for stat in self.sub_model_stats.values(): + # Update sub-model status first. + stat.update_total_memory() + # Add sub-model total_mem to model total_mem. + self._total_mem += stat._total_mem + + def find_layer_state(self, path: Tuple[str], create: bool = False) -> "SimpleMemState": + """ + Find the memory state of a layer. + + Args: + path (Tuple[str]): The path to the layer. + create (bool, optional): Whether to create the layer if it doesn't exist. Defaults to False. + + Returns: + SimpleMemState: The memory state of the layer. + """ + current_node = self + + for _node in path: + if _node not in current_node.sub_model_stats: + if not create: + return None + # Create a layer node. + current_node.sub_model_stats[_node] = SimpleMemState(_node) + + current_node = current_node.sub_model_stats[_node] + + return current_node + + def dump(self, prefix: str = "") -> str: + """ + Dump the memory state of the model and sub-models. + + Args: + prefix (str, optional): The prefix to add to the layer names. Defaults to "". + + Returns: + str: The memory state information. + """ + cur_prefix = prefix + "." + self.layer_name if prefix != "" else self.layer_name + res = f"layer: {cur_prefix}, layer_mem: {self.layer_mem / mb:.2f} MB, total_mem: {self.total_mem / mb:.2f} MB\n" + + for sub_layer in self.sub_model_stats.values(): + res += sub_layer.dump(cur_prefix) + + return res + + def to_json(self, base: int = 1024 * 1024) -> dict: + """ + Convert the memory state to a JSON structure. + + Returns: + dict: The JSON structure of the memory state. + """ + children = [child.to_json() for child in self.sub_model_stats.values()] + if len(children) == 0: + return {"name": self.layer_name, "value": self.layer_mem // base} + else: + return {"name": self.layer_name, "children": children} + + +class SimpleMemoryProfiler: + """ + A memory profiler for a llm model. + + Args: + model (torch.nn.Module): The model to profile. + optimizer (torch.optim.Optimizer): The optimizer used for training the model. + log_file (str): The file to write the memory state information to. + activation_config (List[str], optional): The list of activation layers to track. Defaults to None. + """ + + def __init__( + self, + model: torch.nn.Module, + optimizer: torch.optim.Optimizer, + log_folder: str, + total_steps: int = 5, + activation_config: List[str] = None, + ): + self._model = model + self._optimizer = optimizer + self._log_folder = log_folder + self._remaining_steps = total_steps + + self._stoped = False + self._record_start_time = time.time() + + # For activation memory state. + self._activation_config = activation_config + self._activation_mem_inited: bool = False + self._activation_mem: int = 0 + self._activation_max_count = 0 + self._activation_base_mem: SimpleMemState = SimpleMemState("activations") + + # Check or create log folder + os.makedirs(self._log_folder, exist_ok=True) + + # Register activation memory tracking hooks + self._register_activation_trace_hooks() + + # Calculate static parameter cuda memory + self._param_mem_state = SimpleMemState("param_mem") + self._calc_tensor_memory(self._param_mem_state, self._model.named_parameters()) + # Calculate static grad cuda memory + self._grad_mem_state = SimpleMemState("grad_mem") + self._calc_tensor_memory(self._grad_mem_state, self._model.named_parameters(), True) + # Calculate static optimizer state cuda memory + self._os_params_mem_state = SimpleMemState("os_params_mem") + self._os_state_mem_state = SimpleMemState("os_state_mem") + self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups))) + + # Generate the first memory record + self.point(create=True) + + def point(self, with_options: str = "", create: bool = False) -> None: + """ + Record the memory state. + + Args: + with_options (str, optional): The options to include in the memory state. Defaults to "". + create (bool, optional): Whether to create a new memory record file. Defaults to False. + + Returns: + None + """ + now = time.time() + file = f"{self._log_folder}/memory.log" + + if with_options == "all": + options = ["params", "grads", "os_params", "os_state", "activation_base"] + else: + options = with_options.split(",") + + total_mem = ( + self._param_mem_state.total_mem + + self._grad_mem_state.total_mem + + self._os_params_mem_state.total_mem + + self._os_state_mem_state.total_mem + + self._activation_mem + ) / mb + + # Generate summary information for memory state + summary_info = ( + f"total_memory: {total_mem:.2f} MB" + + "\n" + + f"params_memory: {self._param_mem_state.total_mem / mb:.2f} MB, " + + f"grads_memory: {self._grad_mem_state.total_mem / mb:.2f} MB, " + + f"os_params_memory: {self._os_params_mem_state.total_mem / mb:.2f} MB, " + + f"os_state_memory: {self._os_state_mem_state.total_mem / mb:.2f} MB, " + + f"activation_memory: {self._activation_mem / mb:.2f} MB" + ) + + # Generate layout information based on selected options + layout_info = "" + if "params" in options: + layout_info += "params_layout:\n" + self._param_mem_state.dump() + if "grads" in options: + layout_info += "grads_layout:\n" + self._grad_mem_state.dump() + if "os_params" in options: + layout_info += "os_params_layout:\n" + self._os_params_mem_state.dump() + if "os_state" in options: + layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump() + if "activation_base" in options: + layout_info += "activation_base_layout:\n" + self._activation_base_mem.dump() + + # Write memory state information to log file + file_mode = "w" if create else "a" + with open(file, file_mode, encoding="utf-8") as writer: + writer.write( + "Memory State:\n" + f"time: {now - self._record_start_time}\n" + "---summary---\n" + summary_info + "\n" + ) + if layout_info != "": + writer.write("---Layout---\n" + layout_info) + writer.write("\n") + + def step(self) -> None: + """ + Update the memory state of the optimizer state. + + Returns: + None + """ + if self._stoped: + return + + self._remaining_steps -= 1 + if self._remaining_steps == 0: + self._stoped = True + + # Update os state memory usage + self._os_state_mem_state = SimpleMemState("os_state_mem") + self._calc_tensor_group_memory(self._os_state_mem_state, list(self._optimizer.state_dict()["state"].items())) + + if not self._stoped: + # Do we need to print os_state_layout every time? Is it always constant? + self.point(with_options="os_state") + else: + # Dump memory layout + self.point(with_options="all") + # Generate sunburst charts + self._render_sunburst_chart(self._param_mem_state.to_json()["children"], "params_memory_sunburst") + self._render_sunburst_chart(self._grad_mem_state.to_json()["children"], "grads_memory_sunburst") + self._render_sunburst_chart( + [self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()], + "os_memory_sunburst", + ) + self._render_sunburst_chart(self._activation_base_mem.to_json()["children"], "activation_memory_sunburst") + # Generate summary sunburst chart + summary_sunburst_data = [ + {"name": "params", "value": self._param_mem_state.total_mem // mb}, + {"name": "grads", "value": self._grad_mem_state.total_mem // mb}, + {"name": "os_params", "value": self._os_params_mem_state.total_mem // mb}, + {"name": "os_state", "value": self._os_state_mem_state.total_mem // mb}, + {"name": "activation", "value": self._activation_base_mem.total_mem // mb}, + ] + + self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst") + + def _render_sunburst_chart(self, data: Any, name: str) -> None: + pyecharts.charts.Sunburst(init_opts=pyecharts.options.InitOpts(width="1000px", height="1000px")).add( + name, + data_pair=data, + highlight_policy="ancestor", + radius=[0, "95%"], + levels=[ + {}, + { + "r0": "10%", + "r": "40%", + "itemStyle": {"borderWidth": 3}, + "label": {"align": "left"}, + }, + {"r0": "40%", "r": "65%", "label": {"align": "left"}}, + {"r0": "65%", "r": "80%", "label": {"align": "left"}}, + {"r0": "80%", "r": "90%", "label": {"align": "left"}}, + { + "r0": "90%", + "r": "92%", + "label": {"position": "outside", "padding": 3, "silent": False}, + "itemStyle": {"borderWidth": 3}, + }, + ], + ).set_global_opts(title_opts=pyecharts.options.TitleOpts(title="CUDA Memory")).set_series_opts( + label_opts=pyecharts.options.LabelOpts(formatter="{b}") + ).render( + f"{self._log_folder}/{name}.html" + ) + + def _inner_activation_trace_hook(self, layer_name: str, model: Any, inputs: Any, output: torch.Tensor) -> None: + """ + Hook function to trace the activation memory usage for a inner layer. + + Args: + layer_name (str): The name of the layer. + model (Any): The model. + inputs (Any): The inputs to the layer. + output (torch.Tensor): The output tensor. + + Returns: + None + """ + del model, inputs + assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}" + + if self._stoped or self._activation_mem_inited: + return + + # Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook. + self._activation_base_mem.add(layer_name, output.element_size() * output.nelement(), flush=False) + + def _activation_trace_hook_forward(self, model: Any, inputs: Any, output: torch.Tensor) -> None: + """ + Hook function to trace the activation memory usage for a forward pass. + + Args: + model (Any): The model. + inputs (Any): The inputs to the model. + output (torch.Tensor): The output tensor. + + Returns: + None + """ + del model, inputs + assert isinstance(output, torch.Tensor), f"invalid output type: {type(output)}" + + if self._stoped: + return + + # Check if the activation memory has been initialized + if self._activation_mem_inited is False: + # Update the total memory of the activation base memory state + self._activation_base_mem.update_total_memory() + # Set with_options to "activation_base" to include activation_base_layout in the memory dump + self._activation_mem_inited = True + + # Accumulate activation memory usage for each forward pass + self._activation_mem += self._activation_base_mem.total_mem + + # Update activation max count + if self._activation_mem // self._activation_base_mem.total_mem > self._activation_max_count: + self._activation_max_count = self._activation_mem // self._activation_base_mem.total_mem + + # Trigger a memory record + self.point() + + def _activation_tarce_hook_backward(self, model: Any, inputs: Any, grad_outputs: Any) -> None: + """ + Hook function to trace the activation memory usage for a backward pass. + + Args: + model (Any): The model. + inputs (Any): The inputs to the model. + grad_outputs (Any): The gradients of the outputs. + + Returns: + None + """ + del model, inputs, grad_outputs + + if self._stoped: + return + + # Release activation memory usage for each backward pass + self._activation_mem -= self._activation_base_mem.total_mem + + # Trigger a memory record + self.point() + + def _register_activation_trace_hooks(self) -> None: + """ + Register activation trace hooks for the model and each submodule in the model. + """ + + # Register inner activation trace hooks for each submodule in the model + for layer_name in self._activation_config: + # Register a hook for every activation + model = self._model + sub_models = layer_name.split(".") + # Get the target sub-model + for sub_model_name in sub_models: + try: + model = model.get_submodule(sub_model_name) + except AttributeError: + model = None + break + + # Register the hook + if model is not None: + model.register_forward_hook(partial(self._inner_activation_trace_hook, layer_name)) + + # Register a forward hook for the main model to track activation memory usage + self._model.register_forward_hook(self._activation_trace_hook_forward) + # Register a backward hook for the main model to release activation memory usage + self._model.register_full_backward_hook(self._activation_tarce_hook_backward) + + def _calc_tensor_memory( + self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False + ) -> None: + """ + Calculate the memory usage of tensors and update the memory state. + + Args: + root_stat (SimpleMemState): The root memory state. + named_tensors (Dict[str, torch.Tensor]): A dictionary containing the named tensors. + require_grad (bool, optional): Whether to consider tensors with gradients. Defaults to False. + + Returns: + None + """ + for name, tensor in named_tensors: + if require_grad and not tensor.requires_grad: + continue + + layer_splits = name.split(sep=".") + layer_stat = root_stat.find_layer_state(layer_splits, create=True) + layer_stat.layer_mem = tensor.element_size() * tensor.nelement() + + root_stat.update_total_memory() + + def _calc_tensor_group_memory(self, root_stat: SimpleMemState, tensor_groups: List[Tuple[int, torch.Tensor]]): + """ + Calculate the memory usage of a group of tensors. + + Args: + root_stat (SimpleMemState): The root memory state. + tensor_groups (List[Tuple[int, torch.Tensor]]): A list of tuples containing the tensor groups. + + Returns: + None + """ + + def _normalize_helper(named_tensors: Dict[str, Any]) -> List[Tuple[str, Any]]: + """ + Normalize the named tensors. + + Args: + named_tensors (Dict[str, Any]): The named tensors to normalize. + + Returns: + List[Tuple[str, Any]]: The normalized named tensors. + """ + res = {} + + for name, tensors in named_tensors.items(): + if isinstance(tensors, torch.Tensor): + res[name] = tensors + elif isinstance(tensors, (list, tuple)): + for index, tensor in enumerate(tensors): + res[f"{name}.{index}"] = tensor + elif isinstance(tensors, dict): + for subname, tensor in tensors.items(): + res[f"{name}.{subname}"] = tensor + else: + raise TypeError(f"unsupported normalize value type: {type(tensors)}") + + return list(res.items()) + + def _value_check(tensor_or_tensors): + """ + Check if the input is a tensor or a collection of tensors. + + Args: + tensor_or_tensors (Any): The input to check. + + Returns: + bool: True if the input is a tensor or a collection of tensors, False otherwise. + """ + if torch.is_tensor(tensor_or_tensors): + return True + elif isinstance(tensor_or_tensors, (list, tuple)) and all(torch.is_tensor(x) for x in tensor_or_tensors): + return True + elif isinstance(tensor_or_tensors, dict) and all(torch.is_tensor(x) for x in tensor_or_tensors.values()): + return True + else: + return False + + # Calculate the memory usage of a group of tensors. + for idx, tensors in tensor_groups: + # Normalize the named tensors + named_tensors = {f"{idx}.{k}": v for k, v in tensors.items() if _value_check(v)} + named_tensors = _normalize_helper(named_tensors) + # Calculate the memory usage of the tensors and update the memory state + self._calc_tensor_memory(root_stat, named_tensors) + + +def build_activation_config(num_layers: int, num_chunks: int = 1) -> List[str]: + # TODO: support interleaved pipeline scheduling. + assert num_chunks == 1, "Only support num_chunks == 1" + + if gpc.is_initialized(ParallelMode.PIPELINE): + pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) + pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) + else: + pipeline_size = 1 + pipeline_rank = 0 + + all_parts = partition_uniform(num_layers, pipeline_size, num_chunks) + parts = all_parts[pipeline_rank] + start, end = parts[0] + num_blocks = end - start + + block_conf_tmpl = [ + "mixer.rotary_emb", + "mixer.Wqkv", + "mixer.inner_attn", + "mixer.inner_cross_attn", + "mixer.out_proj", + # "dropout1", # skip when dropout_selective_checkpoint is True + # "dropout2", # skip when dropout_selective_checkpoint is True + "norm1", + "norm2", + "mlp.w1", + "mlp.w2", + "mlp.w3", + ] + + block_conf = [] + for block_id in range(num_blocks): + block_conf += [f"blocks.{block_id}.{layer}" for layer in block_conf_tmpl] + + # We don't need to care about whether the embedding, norm, and head layers exist in the model after partitioning. + # If they don't exist, they will be automatically ignored when registering activation trace hooks. + activation_conf = ["embedding", "norm", "head"] + block_conf + + return activation_conf + + +if __name__ == "__main__": + + class SimpleModel(torch.nn.Module): + """ + A simple model with three linear layers. + + Args: + skip_layer2 (bool, optional): Whether to skip layer2. Defaults to False. + """ + + def __init__(self, skip_layer2: bool = False): + super().__init__() + self.layer1 = torch.nn.Linear(5120, 5120, True) + self.layer3 = torch.nn.Linear(5120, 5120, False) + + if skip_layer2: + self.layer2 = None + else: + self.layer2 = SimpleModel(skip_layer2=True) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the model. + + Args: + inputs (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor. + """ + output1 = self.layer1(inputs) + if self.layer2 is not None: + output2 = self.layer2(output1) + else: + output2 = output1 + output = self.layer3(output2) + + return output + + # init model and optimizer + _model: torch.nn.Module = SimpleModel() + _optimizer = torch.optim.Adam(_model.parameters()) + + # create activation config for simple model layer by layer. + activation_configs = [ + # model level 0 + "layer1", + "layer2", + "layer3", + # model level 1 + "layer2.layer1", + "layer2.layer3", + ] + + _model.modules() + + # init profiler + profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler.log", activation_configs) + + _optimizer.zero_grad() + + x1 = torch.randn((128, 5120)) + x2 = torch.randn((128, 5120)) + out1 = _model(x1) + out2 = _model(x2) + out1.mean().backward() + out2.mean().backward() + + _optimizer.step() + + # Update the optimizer state memory usage and record the memory state + profiler.step() diff --git a/internlm/utils/timeout.py b/internlm/utils/timeout.py new file mode 100644 index 0000000..07a0911 --- /dev/null +++ b/internlm/utils/timeout.py @@ -0,0 +1,26 @@ +import signal + + +class Timeout: + """Timer to execute code + + Adapted from https://github.com/reasoning-machines/pal + + Args: + seconds (float): The maximum seconds to execute code + error_message (str) + """ + + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def timeout_handler(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.timeout_handler) + signal.alarm(self.seconds) + + def __exit__(self, error_type, value, traceback): + signal.alarm(0) diff --git a/requirements/runtime.txt b/requirements/runtime.txt index f7cd137..c0b345f 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -12,4 +12,5 @@ packaging boto3 botocore torch-scatter +pyecharts -f https://data.pyg.org/whl/torch-1.13.0+cu117.html \ No newline at end of file diff --git a/tools/README.md b/tools/README.md index f3ba385..0c78a56 100644 --- a/tools/README.md +++ b/tools/README.md @@ -1,4 +1,5 @@ 本目录提供辅助模型训练的一些工具,文件结构如下所示: + ```bash ├── transformers # 适配hugging face的transformers的一些工具 │ ├── configuration_internlm.py # config适配工具 @@ -9,9 +10,11 @@ ``` # tokenizer.py + 生成原始数据的`bin`和`meta`文件需要使用`tokenizer`,我们通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前我们提供了`V7_sft.model`来生成tokens。若想使用不同的模型,可直接修改`tokernizer.py`中的模型参数路径。 可以运行以下命令生成原始数据对应的`bin`和`meta`文件,其中参数`text_input_path`表示原始文本数据路径,目前支持`txt`、`json`和`jsonl`三种输入格式,`bin_output_path`表示生成的`bin`文件的保存路径。 + ```bash $ python tools/tokenizer.py --text_input_path your_input_text_path --bin_output_path your_output_bin_path ``` @@ -19,6 +22,7 @@ $ python tools/tokenizer.py --text_input_path your_input_text_path --bin_output_ 下面是一个数据处理的例子: 给定一个包含原始数据集的文件`raw_data.txt`,原始数据集如下所示: + ```bash 感恩生活中的每一个细节,才能真正体会到幸福的滋味。 梦想是人生的动力源泉,努力追逐,才能实现自己的目标。 @@ -35,19 +39,73 @@ $ python tools/tokenizer.py --text_input_path raw_data.txt --bin_output_path cn/ 其中,`cn`表示中文数据集;`en`表示英文数据集;`code`表示代码数据集;`ja`表示日语数据集;`ar`表示阿拉伯语数据集;`kaoshi`表示考试数据集。 生成的bin文件的格式如下: + ```python {"tokens": [73075, 75302, 69522, 69022, 98899, 67713, 68015, 81269, 74637, 75445, 99157]} {"tokens": [69469, 60355, 73026, 68524, 60846, 61844, 98899, 67775, 79241, 98899, 67713, 67800, 67453, 67838, 99157]} {"tokens": [68057, 79017, 60378, 68014, 98899, 67713, 67990, 68015, 70381, 67428, 61003, 67622, 99157]} ``` + `bin`文件中的每一行均对应原始数据集中的每一个句子,表示每个句子的`token`(下文将用sequence指定)。 生成的`meta`文件的格式如下: + ```bash (0, 11), (90, 15), (208, 13) ``` + 在`meta`文件中,每个元组对应着`bin`文件中每一个`sequence`的元信息。其中,元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting index`,第二个元素表示每个`sequence`中有多少个`tokens`。 例如,对于第一个`sequence`,`starting index`为 0,有 11 个`tokens`;对于第二个`sequence`,由于第一个`sequence`转换为`string`后的长度为`89`,因此它的`starting index`为 90,有 15 个`tokens`。 -`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致,此处不再赘叙。 \ No newline at end of file +`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致,此处不再赘叙。 + +# pal_inference.py + +在 [GSM8K](https://huggingface.co/datasets/gsm8k) 数据集上使用 [PAL](https://github.com/reasoning-machines/pal) 范式推理,使模型编写代码并通过 Python 解释器执行来解决数学问题。其用法如下: + +```python +# 用法: +python pal_inference.py [--dataset ] [--max_length ] [--top_p ] [--eoh ] [--eoa ] [--eos ] [--temperature ] [--time_out