diff --git a/colossalai/gemini/memory_tracer/memstats_collector.py b/colossalai/gemini/memory_tracer/memstats_collector.py
index a09844755..07ef74fbf 100644
--- a/colossalai/gemini/memory_tracer/memstats_collector.py
+++ b/colossalai/gemini/memory_tracer/memstats_collector.py
@@ -1,6 +1,6 @@
-from colossalai.gemini.memory_tracer import GLOBAL_MODEL_DATA_TRACER
 from colossalai.gemini.memory_tracer import SyncCudaMemoryMonitor
 from colossalai.utils.memory import colo_device_memory_used
+from colossalai.gemini.stateful_tensor import StatefulTensor
 import torch
 import time
 
@@ -92,7 +92,8 @@ class MemStatsCollector:
         """Sampling model data statistics.
         """
         if self._start_flag:
-            cuda_mem, cpu_mem = GLOBAL_MODEL_DATA_TRACER.both_mem_usage
+            cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
+            cpu_mem = StatefulTensor.GST_MGR.total_mem['cpu']
             self._model_data_cuda_list.append(cuda_mem)
             self._model_data_cpu_list.append(cpu_mem)
 
@@ -114,24 +115,6 @@ class MemStatsCollector:
         self._sampling_time.append(time.time())
         self._mem_monitor.start()
 
-    def sample_memstats(self) -> None:
-        """
-        Sampling memory statistics.
-        Record the current model data CUDA memory usage as well as system CUDA memory usage.
-        Advance the sampling cnter.
-        """
-        if self._start_flag:
-            self._model_data_cuda_list.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)
-            self._overall_cuda_list.append(self._mem_monitor.finish())
-            self._non_model_data_cuda_list.append(self._overall_cuda_list[-1] - self._model_data_cuda_list[-1])
-
-            self._model_data_cpu_list.append(GLOBAL_MODEL_DATA_TRACER.cpu_usage)
-            # FIXME(jiaruifang) cpu sys used should also return from self._mem_monitor()
-            self._overall_cpu_list.append(colo_device_memory_used(torch.device(f'cpu')))
-            self._non_model_data_cpu_list.append(self._overall_cpu_list[-1] - self._model_data_cpu_list[-1])
-        self._sampling_time.append(time.time())
-        self._mem_monitor.start()
-
     def clear(self) -> None:
         self._model_data_cuda_list = []
         self._overall_cuda_list = []
diff --git a/colossalai/gemini/tensor_placement_policy.py b/colossalai/gemini/tensor_placement_policy.py
index b35417c7b..3ef52b439 100644
--- a/colossalai/gemini/tensor_placement_policy.py
+++ b/colossalai/gemini/tensor_placement_policy.py
@@ -7,7 +7,6 @@ from colossalai.utils.memory import colo_device_memory_capacity
 from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
 from colossalai.gemini.stateful_tensor import StatefulTensor
 from colossalai.gemini.memory_tracer import MemStatsCollector
-from colossalai.gemini.memory_tracer import GLOBAL_MODEL_DATA_TRACER
 from typing import Type
 
 
@@ -79,7 +78,7 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
         """
         volume = 0
         cuda_capacity = colo_device_memory_capacity(get_current_device())
-        used_cuda_model_data = GLOBAL_MODEL_DATA_TRACER.cuda_usage
+        used_cuda_model_data = StatefulTensor.GST_MGR.total_mem['cuda']
         if warmup:
             # We designate a part of CUDA memory for model data in warmup iterations.
             max_cuda_non_model_data_per_period = cuda_capacity * self._warmup_non_model_data_ratio
diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
index 7dd1ec3e3..0f958aaea 100644
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -13,8 +13,6 @@ from colossalai.engine.paramhooks import BaseParamHookMgr
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device, disposable
 from colossalai.gemini.memory_tracer.memstats_collector import MemStatsCollector
-from colossalai.gemini.memory_tracer.model_data_memtracer import \
-    GLOBAL_MODEL_DATA_TRACER
 from colossalai.utils.memory import colo_device_memory_capacity
 from colossalai.zero.shard_utils import BaseShardStrategy
 from colossalai.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
@@ -106,7 +104,6 @@ class ShardedModelV2(nn.Module):
 
         self._use_memory_tracer = tensor_placement_policy == 'auto'
         if self._use_memory_tracer:
-            GLOBAL_MODEL_DATA_TRACER.register_model(self)
             self._memstats_collector = MemStatsCollector()
             self._start_collect_memstats = disposable(self._memstats_collector.start_collection)
             self._finish_collect_memstats = disposable(self._memstats_collector.finish_collection)
diff --git a/colossalai/zero/sharded_optim/sharded_optim_v2.py b/colossalai/zero/sharded_optim/sharded_optim_v2.py
index 2a4a69e41..884619b1c 100644
--- a/colossalai/zero/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/zero/sharded_optim/sharded_optim_v2.py
@@ -10,10 +10,7 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.gemini.memory_tracer.model_data_memtracer import \
-    GLOBAL_MODEL_DATA_TRACER
-from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_model_tensor_clone,
-                                            colo_tensor_mem_usage)
+from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
 from colossalai.zero.sharded_model import ShardedModelV2
 from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 from torch import Tensor
@@ -130,8 +127,6 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
             f"After init ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory!",
             ranks=[0])
         self._use_memory_tracer = self.model.use_memory_tracer
-        if self._use_memory_tracer:
-            GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
 
     @property
     def loss_scale(self):
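Note (illustration only, not part of the patch): the change replaces a registration-based global tracer (GLOBAL_MODEL_DATA_TRACER, which models and optimizers had to register with) by direct reads of the per-device byte counters kept on the global stateful tensor manager, StatefulTensor.GST_MGR.total_mem. The following minimal, self-contained Python sketch uses hypothetical stand-in classes to show that pattern; it is not ColossalAI code and the names are assumptions.

# Stand-in for the manager behind StatefulTensor.GST_MGR: it already accounts
# for every stateful tensor, so collectors just read its counters.
class _StatefulTensorManagerSketch:

    def __init__(self):
        # model-data bytes currently resident on each device type
        self.total_mem = {'cuda': 0, 'cpu': 0}

    def account_tensor(self, device_type: str, num_bytes: int) -> None:
        self.total_mem[device_type] += num_bytes


# Stand-in collector: no register_model / register_optimizer step is needed,
# it samples the manager's counters directly.
class _MemStatsCollectorSketch:

    def __init__(self, manager: _StatefulTensorManagerSketch):
        self._manager = manager
        self._model_data_cuda_list = []
        self._model_data_cpu_list = []

    def sample_model_data(self) -> None:
        self._model_data_cuda_list.append(self._manager.total_mem['cuda'])
        self._model_data_cpu_list.append(self._manager.total_mem['cpu'])


if __name__ == '__main__':
    mgr = _StatefulTensorManagerSketch()
    mgr.account_tensor('cuda', 4 * 1024**2)    # pretend a 4 MB parameter shard lives on GPU
    collector = _MemStatsCollectorSketch(mgr)
    collector.sample_model_data()
    print(collector._model_data_cuda_list)     # [4194304]
    print(collector._model_data_cpu_list)      # [0]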