[hotfix] fix chunk size that cannot be evenly divided (#2867)

* [hotfix] fix chunk size that cannot be evenly divided

* [hotfix] use numpy (np.lcm) for Python 3.8 compatibility
pull/2876/head^2
HELSON 2 years ago committed by GitHub
parent a4fc125c34
commit 6e4ac08172
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -72,6 +72,9 @@ class ChunkManager:
if tensor.numel() > chunk_size: if tensor.numel() > chunk_size:
chunk_size = tensor.numel() chunk_size = tensor.numel()
dp_size = tensor.process_group.dp_world_size()
chunk_size = chunk_size + (-chunk_size % dp_size)
chunk = Chunk( chunk = Chunk(
chunk_size=chunk_size, chunk_size=chunk_size,
process_group=tensor.process_group, process_group=tensor.process_group,

@ -119,6 +119,7 @@ def search_chunk_configuration(
assert search_range_byte >= 0 assert search_range_byte >= 0
params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag) params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
size_lcm = np.lcm.reduce(list(params_dict.keys()))
config_dict: Dict[int, Dict] = dict() config_dict: Dict[int, Dict] = dict()
total_param_size = 0 total_param_size = 0
@ -154,6 +155,8 @@ def search_chunk_configuration(
min_chunk_waste = temp_waste min_chunk_waste = temp_waste
best_chunk_size = chunk_size best_chunk_size = chunk_size
# round the chunk size up so that it is divisible by every group's size
best_chunk_size = best_chunk_size + (-best_chunk_size % size_lcm)
for dp_degree in params_dict: for dp_degree in params_dict:
if dp_degree in config_dict: if dp_degree in config_dict:
continue continue

Loading…
Cancel
Save