diff --git a/doc/en/install.md b/doc/en/install.md index b3c42e4..0c721b9 100644 --- a/doc/en/install.md +++ b/doc/en/install.md @@ -5,10 +5,10 @@ The required packages and corresponding version are shown as follows: - Python == 3.10 - GCC == 10.2.0 - MPFR == 4.1.0 -- CUDA == 11.7 -- Pytorch == 1.13.1+cu117 +- CUDA >= 11.7 +- Pytorch >= 1.13.1 - Transformers >= 4.28.0 -- Flash-Attention == v1.0.5 +- Flash-Attention >= v1.0.5 - Apex == 23.05 - GPU with Ampere or Hopper architecture (such as H100, A100) - Linux OS @@ -57,3 +57,14 @@ cd ./third_party/apex pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ cd ../../ ``` + +### Environment Image +Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows: + +```bash +# pull image +docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos +# start container +docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos +docker exec -it myinternlm bash +``` diff --git a/doc/install.md b/doc/install.md index 5af3a95..d5e547e 100644 --- a/doc/install.md +++ b/doc/install.md @@ -5,10 +5,10 @@ - Python == 3.10 - GCC == 10.2.0 - MPFR == 4.1.0 -- CUDA == 11.7 -- Pytorch == 1.13.1+cu117 +- CUDA >= 11.7 +- Pytorch >= 1.13.1 - Transformers >= 4.28.0 -- Flash-Attention == v1.0.5 +- Flash-Attention >= v1.0.5 - Apex == 23.05 - Ampere或者Hopper架构的GPU (例如H100, A100) - Linux OS @@ -57,3 +57,13 @@ cd ./third_party/apex pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ cd ../../ ``` + +### 环境镜像 +用户可以从 https://hub.docker.com/r/sunpengsdu/internlm 获取安装了 InternLM 运行环境的镜像,拉取镜像及启动容器的命令如下: +```bash +# 拉取镜像 +docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos +# 启动容器 +docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos +docker exec -it myinternlm bash +``` diff --git a/internlm/solver/lr_scheduler.py b/internlm/solver/lr_scheduler.py index 2e228aa..bcbca88 100644 --- a/internlm/solver/lr_scheduler.py +++ b/internlm/solver/lr_scheduler.py @@ -27,7 +27,7 @@ class WarmupScheduler(_LRScheduler): def state_dict(self): state_dict = {key: value for key, value in self.__dict__.items() if key not in "optimizer"} - if isinstance(state_dict["after_scheduler"], _LRScheduler): + if isinstance(state_dict["after_scheduler"], (_LRScheduler, _CosineAnnealingLR)): state_dict["after_scheduler_type"] = type(state_dict["after_scheduler"]).__name__ state_dict["after_scheduler_dict"] = state_dict["after_scheduler"].state_dict() del state_dict["after_scheduler"] @@ -40,7 +40,7 @@ class WarmupScheduler(_LRScheduler): for key in list(self.__dict__.keys()): if key in state_dict: self.__dict__[key] = state_dict[key] - if isinstance(self.after_scheduler, _LRScheduler): + if isinstance(self.after_scheduler, (_LRScheduler, _CosineAnnealingLR)): assert type(self.after_scheduler).__name__ == state_dict["after_scheduler_type"] # state_dict['after_scheduler_dict'] = state_dict['after_scheduler'].state_dict() self.after_scheduler.load_state_dict(state_dict["after_scheduler_dict"]) diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 7b19ecb..3ee2270 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -1,13 +1,13 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import math from functools import partial import amp_C import torch import torch.distributed as dist from apex.multi_tensor_apply import multi_tensor_applier -from torch._six import inf from torch.optim import Optimizer from internlm.core.context import Config, ParallelMode @@ -33,6 +33,7 @@ from internlm.utils.logger import get_logger from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.parallel import is_model_parallel_parameter +inf = math.inf logger = get_logger(__file__)