From f32861ccc59970a0a6d9650a3332c1267e20c83e Mon Sep 17 00:00:00 2001
From: Hongxin Liu
Date: Mon, 24 Feb 2025 14:35:48 +0800
Subject: [PATCH] [misc] update torch version (#6206)

* [misc] update torch version

* fix test

* fix test

* fix test

* fix test
---
 .compatibility                                      | 2 +-
 .cuda_ext.json                                      | 4 ++--
 requirements/requirements.txt                       | 2 +-
 tests/test_cluster/test_device_mesh_manager.py      | 3 ++-
 tests/test_shardformer/test_model/test_shard_t5.py  | 2 +-
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.compatibility b/.compatibility
index e1836506a..69d483524 100644
--- a/.compatibility
+++ b/.compatibility
@@ -1,3 +1,3 @@
-2.2.2-12.1.0
 2.3.0-12.1.0
 2.4.0-12.4.1
+2.5.1-12.4.1
diff --git a/.cuda_ext.json b/.cuda_ext.json
index 1e617755b..01a30a9c1 100644
--- a/.cuda_ext.json
+++ b/.cuda_ext.json
@@ -1,11 +1,11 @@
 {
     "build": [
         {
-            "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121",
+            "torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
             "cuda_image": "hpcaitech/cuda-conda:12.1"
         },
         {
-            "torch_command": "pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124",
+            "torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
             "cuda_image": "hpcaitech/cuda-conda:12.4"
         }
     ]
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index f357c45fd..688c47cc2 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.2.0,<=2.4.1
+torch>=2.2.0,<=2.5.1
 safetensors
 einops
 pydantic
diff --git a/tests/test_cluster/test_device_mesh_manager.py b/tests/test_cluster/test_device_mesh_manager.py
index 5d140064b..c4a92a138 100644
--- a/tests/test_cluster/test_device_mesh_manager.py
+++ b/tests/test_cluster/test_device_mesh_manager.py
@@ -1,7 +1,7 @@
 from colossalai.cluster.device_mesh_manager import DeviceMeshInfo, DeviceMeshManager
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
-from colossalai.testing import spawn
+from colossalai.testing import rerun_if_address_is_in_use, spawn


 def check_device_mesh_manager(rank, world_size, port):
@@ -24,6 +24,7 @@ def check_device_mesh_manager(rank, world_size, port):
     assert device_mesh_with_shape._logical_mesh_id.tolist() == [[0, 1], [2, 3]]


+@rerun_if_address_is_in_use()
 def test_device_mesh_manager():
     spawn(check_device_mesh_manager, 4)

diff --git a/tests/test_shardformer/test_model/test_shard_t5.py b/tests/test_shardformer/test_model/test_shard_t5.py
index 6cdf5bf41..40b4e368d 100644
--- a/tests/test_shardformer/test_model/test_shard_t5.py
+++ b/tests/test_shardformer/test_model/test_shard_t5.py
@@ -51,7 +51,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if test_config["precision"] == "fp32":
         atol, rtol = 1e-5, 1e-3
     else:
-        atol, rtol = 5e-2, 5e-2
+        atol, rtol = 9e-2, 0
     if (stage_manager is None or stage_manager.is_first_stage()) and booster.plugin.zero_stage == 0:
         row_layer_grads = get_grad_tensors_for_check(
             t5, sharded_t5, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0