Mirror of https://github.com/InternLM/InternLM

feat(solver): fix code to adapt to torch2.0 and provide docker images (#128)

* feat(solver): fix code to adapt to torch2.0
* docs(install.md): publish internlm environment image
* docs(install.md): update dependency packages version
* docs(install.md): update default image

Co-authored-by: 黄婷 <huangting3@CN0014010744M.local>

parent 084a841799
commit 26205c1edf
Changes to the English install.md:

@@ -5,10 +5,10 @@ The required packages and corresponding version are shown as follows:
 - Python == 3.10
 - GCC == 10.2.0
 - MPFR == 4.1.0
-- CUDA == 11.7
-- Pytorch == 1.13.1+cu117
+- CUDA >= 11.7
+- Pytorch >= 1.13.1
 - Transformers >= 4.28.0
-- Flash-Attention == v1.0.5
+- Flash-Attention >= v1.0.5
 - Apex == 23.05
 - GPU with Ampere or Hopper architecture (such as H100, A100)
 - Linux OS
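Since several exact pins are loosened to minimum versions here, a quick runtime sanity check can catch a mismatched environment early. The following is a minimal illustrative sketch, not a script from the repository; it assumes the `packaging` library is available and simply mirrors the thresholds listed above:

```python
import sys

import torch
from packaging import version

# Illustrative check against the loosened requirements in install.md.
assert sys.version_info[:2] == (3, 10), "Python == 3.10 expected"
assert version.parse(torch.__version__.split("+")[0]) >= version.parse("1.13.1")
assert torch.version.cuda is not None, "a CUDA build of PyTorch is required"
assert version.parse(torch.version.cuda) >= version.parse("11.7")
# Ampere (sm_80) and Hopper (sm_90) GPUs report compute capability major >= 8.
assert torch.cuda.get_device_capability()[0] >= 8
```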
Further down in the same document, the Apex build instructions are unchanged and a new section is appended after them:

@@ -57,3 +57,14 @@ cd ./third_party/apex
 pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
 cd ../../
 ```
+
+### Environment Image
+
+Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+
+```bash
+# pull image
+docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
+# start container
+docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
+docker exec -it myinternlm bash
+```
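A note on the `docker run` flags used above: `--gpus all` exposes every host GPU to the container, `--shm-size=2gb` enlarges `/dev/shm` beyond Docker's 64 MB default (PyTorch dataloader workers exchange tensors through shared memory and can exhaust the default quickly), and `-d -it` keeps a detached container with an interactive TTY so that `docker exec -it myinternlm bash` can attach a shell later.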
The Chinese install.md receives the same two hunks: the identical version-pin changes in the dependency list (with the GPU requirement written in Chinese as "Ampere或者Hopper架构的GPU (例如H100, A100)", i.e. a GPU with Ampere or Hopper architecture), and an appended "### 环境镜像" (Environment Image) section containing the same docker pull/run/exec commands.
Changes to WarmupScheduler.state_dict() and load_state_dict():

@@ -27,7 +27,7 @@ class WarmupScheduler(_LRScheduler):
     def state_dict(self):
         state_dict = {key: value for key, value in self.__dict__.items() if key not in "optimizer"}
-        if isinstance(state_dict["after_scheduler"], _LRScheduler):
+        if isinstance(state_dict["after_scheduler"], (_LRScheduler, _CosineAnnealingLR)):
             state_dict["after_scheduler_type"] = type(state_dict["after_scheduler"]).__name__
             state_dict["after_scheduler_dict"] = state_dict["after_scheduler"].state_dict()
             del state_dict["after_scheduler"]

@@ -40,7 +40,7 @@ class WarmupScheduler(_LRScheduler):
         for key in list(self.__dict__.keys()):
             if key in state_dict:
                 self.__dict__[key] = state_dict[key]
-        if isinstance(self.after_scheduler, _LRScheduler):
+        if isinstance(self.after_scheduler, (_LRScheduler, _CosineAnnealingLR)):
             assert type(self.after_scheduler).__name__ == state_dict["after_scheduler_type"]
             # state_dict['after_scheduler_dict'] = state_dict['after_scheduler'].state_dict()
             self.after_scheduler.load_state_dict(state_dict["after_scheduler_dict"])
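Why the widened isinstance check: in torch 2.0 the scheduler base class was renamed from `_LRScheduler` to `LRScheduler`, with `_LRScheduler` kept only as a thin compatibility subclass, so built-in schedulers such as `CosineAnnealingLR` may no longer pass an `isinstance(..., _LRScheduler)` test; without the extra class in the tuple, the wrapped `after_scheduler` would silently be dropped from checkpoints. Below is a minimal sketch of the pattern, assuming `_CosineAnnealingLR` is torch's `CosineAnnealingLR` under an alias, as the name suggests; the optimizer setup is illustrative, not InternLM's training code:

```python
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR
from torch.optim.lr_scheduler import _LRScheduler

opt = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
after_scheduler = _CosineAnnealingLR(opt, T_max=1000)

# The widened check: accept the legacy base class (torch 1.13) as well as
# the concrete cosine scheduler (torch 2.0), so the wrapped scheduler is
# serialized on both versions instead of being silently skipped.
if isinstance(after_scheduler, (_LRScheduler, _CosineAnnealingLR)):
    payload = {
        "after_scheduler_type": type(after_scheduler).__name__,
        "after_scheduler_dict": after_scheduler.state_dict(),
    }

# Restoring mirrors load_state_dict() above: verify the type, then load.
restored = _CosineAnnealingLR(opt, T_max=1000)
assert type(restored).__name__ == payload["after_scheduler_type"]
restored.load_state_dict(payload["after_scheduler_dict"])
```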
Changes to the solver optimizer code, dropping the torch._six dependency:

@@ -1,13 +1,13 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+import math
 from functools import partial
 
 import amp_C
 import torch
 import torch.distributed as dist
 from apex.multi_tensor_apply import multi_tensor_applier
-from torch._six import inf
 from torch.optim import Optimizer
 
 from internlm.core.context import Config, ParallelMode

@@ -33,6 +33,7 @@ from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 from internlm.utils.parallel import is_model_parallel_parameter
 
+inf = math.inf
 logger = get_logger(__file__)
 
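`torch._six` was removed in torch 2.0, so `from torch._six import inf` raises an ImportError there; the commit replaces it with `math.inf`, which is the same `float("inf")` value the old import provided. For code that must run on both sides of the boundary, a version-agnostic fallback is also possible; this is a minimal sketch, not taken from the commit:

```python
import math

try:
    # torch < 2.0 re-exported inf through the private torch._six module
    from torch._six import inf
except ImportError:
    # torch >= 2.0 removed torch._six; math.inf is the identical value
    inf = math.inf
```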