[misc] update torch version (#6206)

* [misc] update torch version

* fix test

* fix test

* fix test

* fix test
Hongxin Liu 2025-02-24 14:35:48 +08:00 committed by GitHub
parent b9e60559b8
commit f32861ccc5
5 changed files with 7 additions and 6 deletions


@@ -1,3 +1,3 @@
-2.2.2-12.1.0
 2.3.0-12.1.0
 2.4.0-12.4.1
+2.5.1-12.4.1
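Note: each line of this version matrix pairs a PyTorch release with the CUDA toolkit it is built against, in TORCH-CUDA form (e.g. torch 2.5.1 with CUDA 12.4.1). A minimal sketch of how a CI script might split these entries follows; the file name ".compatibility" and the parsing helper are assumptions for illustration, not code from this commit.

# Hypothetical parser for the TORCH-CUDA matrix above; the file name
# and this helper are assumptions, not part of the commit.
from pathlib import Path

def load_matrix(path: str = ".compatibility") -> list[tuple[str, str]]:
    pairs = []
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if not line:
            continue
        # Split only on the first dash: "2.5.1-12.4.1" -> ("2.5.1", "12.4.1")
        torch_version, cuda_version = line.split("-", 1)
        pairs.append((torch_version, cuda_version))
    return pairs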


@@ -1,11 +1,11 @@
 {
     "build": [
         {
-            "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121",
+            "torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
             "cuda_image": "hpcaitech/cuda-conda:12.1"
         },
         {
-            "torch_command": "pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124",
+            "torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
             "cuda_image": "hpcaitech/cuda-conda:12.4"
         }
     ]
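Note: this build matrix maps each pip command for torch/torchvision/torchaudio to the conda CUDA image it should run in, moving the tested floor and ceiling from 2.1.0/2.4.0 to 2.3.0/2.5.1. A sketch of a driver that consumes the JSON, assuming the file name and the shell-out approach (neither is shown in this commit):

# Hypothetical consumer of the build matrix; in real CI the command would
# run inside the named container rather than on the host.
import json
import subprocess

with open(".cuda_ext.json") as f:
    matrix = json.load(f)

for entry in matrix["build"]:
    print(f"using image {entry['cuda_image']}")
    subprocess.run(entry["torch_command"], shell=True, check=True)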


@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.2.0,<=2.4.1
+torch>=2.2.0,<=2.5.1
 safetensors
 einops
 pydantic
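Note: the requirements pin widens the supported torch range to 2.2.0 through 2.5.1. A quick way to verify that an installed interpreter satisfies the pin, sketched with the packaging library (this check is illustrative, not part of the repository):

# Verify the installed torch falls inside the pinned range; illustrative only.
import torch
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=2.2.0,<=2.5.1")
installed = Version(torch.__version__.split("+")[0])  # drop local tag, e.g. "+cu124"
assert installed in spec, f"torch {installed} is outside {spec}"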


@@ -1,7 +1,7 @@
 from colossalai.cluster.device_mesh_manager import DeviceMeshInfo, DeviceMeshManager
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
-from colossalai.testing import spawn
+from colossalai.testing import rerun_if_address_is_in_use, spawn


 def check_device_mesh_manager(rank, world_size, port):
@@ -24,6 +24,7 @@ def check_device_mesh_manager(rank, world_size, port):
     assert device_mesh_with_shape._logical_mesh_id.tolist() == [[0, 1], [2, 3]]


+@rerun_if_address_is_in_use()
 def test_device_mesh_manager():
     spawn(check_device_mesh_manager, 4)
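Note: @rerun_if_address_is_in_use() retries the test when a previous spawn left the rendezvous port busy, a common flake in distributed tests on CI. The real decorator lives in colossalai.testing; the sketch below only illustrates the general retry-on-port-conflict pattern and is not the library's implementation:

# Illustrative retry decorator, not colossalai's actual code.
import functools
import time

def rerun_on_address_in_use(max_tries: int = 3, delay: float = 1.0):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(max_tries):
                try:
                    return fn(*args, **kwargs)
                except RuntimeError as e:
                    if "address already in use" not in str(e).lower() or attempt == max_tries - 1:
                        raise
                    time.sleep(delay)  # give the OS time to release the port
        return wrapper
    return decorator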


@@ -51,7 +51,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if test_config["precision"] == "fp32":
         atol, rtol = 1e-5, 1e-3
     else:
-        atol, rtol = 5e-2, 5e-2
+        atol, rtol = 9e-2, 0
     if (stage_manager is None or stage_manager.is_first_stage()) and booster.plugin.zero_stage == 0:
         row_layer_grads = get_grad_tensors_for_check(
             t5, sharded_t5, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0
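Note: the fp16/bf16 tolerance change swaps a relative bound for a flat absolute one. torch.allclose(a, b, rtol, atol) accepts elementwise when |a - b| <= atol + rtol * |b|, so with rtol=0 the allowed error no longer grows with the magnitude of the reference tensor:

# Effect of rtol=0: the acceptance bound stops scaling with |b|.
import torch

a = torch.tensor([1.00, 100.0])
b = torch.tensor([1.05, 100.5])

print(torch.allclose(a, b, atol=5e-2, rtol=5e-2))  # True: 0.5 <= 0.05 + 0.05 * 100.5
print(torch.allclose(a, b, atol=9e-2, rtol=0))     # False: 0.5 > 0.09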