import numpy as np


def get_submesh_choices(num_hosts, num_devices_per_host, mode="new"):
    """Enumerate candidate submesh shapes (n_hosts, n_devices_per_host) for a pipeline stage."""
    submesh_choices = []
    i = 1
    p = -1
    while i <= num_devices_per_host:
        i *= 2
        p += 1
    # The builtin pow keeps these values ints; math.pow would return floats, which
    # break the device-count arithmetic and array indexing in alpa_dp_impl below.
    assert pow(2, p) == num_devices_per_host, (
        "Only supports the cases where num_devices_per_host is a power of two, "
        f"while now num_devices_per_host = {num_devices_per_host}"
    )
    if mode == "alpa":
        # Alpa's choices: (1, 2^0) ... (1, 2^p) slices of a single host, then whole hosts.
        for i in range(p + 1):
            submesh_choices.append((1, pow(2, i)))
        for i in range(2, num_hosts + 1):
            submesh_choices.append((i, num_devices_per_host))
    elif mode == "new":
        # All power-of-two rectangles (2^i, 2^j) with 2^(i + j) <= num_devices_per_host.
        for i in range(p // 2 + 1):
            for j in range(i, p - i + 1):
                submesh_choices.append((pow(2, i), pow(2, j)))
    return submesh_choices
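
# Quick sanity check of the two modes (illustrative, assuming 2 hosts x 4 devices per host):
#   get_submesh_choices(2, 4, mode="alpa") -> [(1, 1), (1, 2), (1, 4), (2, 4)]
#   get_submesh_choices(2, 4, mode="new")  -> [(1, 1), (1, 2), (1, 4), (2, 2)]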


def alpa_dp_impl(
    num_layers, num_devices, num_microbatches, submesh_choices, compute_cost, max_stage_cost, best_configs
):
    """Implementation of Alpa's DP algorithm for pipeline stage construction.

    Paper reference: https://www.usenix.org/system/files/osdi22-zheng-lianmin.pdf

    Arguments:
        num_layers: K, the number of layers to partition into stages.
        num_devices: N * M, the total number of devices.
        num_microbatches: B, the number of microbatches per iteration.
        submesh_choices: List[(n_i, m_i)], the candidate submesh shapes.
        compute_cost: t_intra, already reduced over autosharding configs;
            compute_cost[k, i, m] is the cost of running layers k..i on submesh m.
        max_stage_cost: upper bound on the cost of any single stage.
        best_configs: best_configs[k, i, m] is the best autosharding config for that entry.
    """
    # For f, layer IDs start from 0.
    # f[#pipeline stages, layer id that is currently being considered, number of devices used]
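    # DP recurrence: make layers [k, i) the current stage on submesh m, then
    #   f[s, k, d] = min over i, m of compute_cost[k, i - 1, m] + f[s - 1, i, d - devices(m)]
    # f_stage_max tracks the bottleneck (maximum) stage cost along the chosen split,
    # which determines the steady-state pipeline throughput.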
    f = np.full((num_layers + 1, num_layers + 1, num_devices + 1), np.inf, dtype=np.float32)
    f_stage_max = np.full((num_layers + 1, num_layers + 1, num_devices + 1), 0.0, dtype=np.float32)
    f_argmin = np.full((num_layers + 1, num_layers + 1, num_devices + 1, 3), -1, dtype=np.int32)
    f[0, num_layers, 0] = 0
    for s in range(1, num_layers + 1):
        for k in range(num_layers - 1, -1, -1):
            for d in range(1, num_devices + 1):
                for m, submesh in enumerate(submesh_choices):
                    n_submesh_devices = int(np.prod(np.array(submesh)))
                    if n_submesh_devices <= d:
                        # TODO: [luzgh]: Why does alpa need max_n_succ_stages? Delete.
                        # if s - 1 <= max_n_succ_stages[i, k - 1, m, n_config]:
                        # ...
                        for i in range(num_layers, k, -1):
                            # The current stage covers layers [k, i); the rest start at layer i.
                            stage_cost = compute_cost[k, i - 1, m]
                            new_cost = f[s - 1, i, d - n_submesh_devices] + stage_cost
                            if stage_cost <= max_stage_cost and new_cost < f[s, k, d]:
                                f[s, k, d] = new_cost
                                f_stage_max[s, k, d] = max(stage_cost, f_stage_max[s - 1, i, d - n_submesh_devices])
                                f_argmin[s, k, d] = (i, m, best_configs[k, i - 1, m])
    best_s = -1
    best_total_cost = np.inf
    for s in range(1, num_layers + 1):
        if f[s, 0, num_devices] < best_total_cost:
            best_s = s
            best_total_cost = f[s, 0, num_devices]

    if np.isinf(best_total_cost):
        return np.inf, None

    # Total pipeline latency: one pass through all stages, plus (B - 1) more
    # rounds bottlenecked by the slowest stage.
    total_cost = f[best_s, 0, num_devices] + (num_microbatches - 1) * f_stage_max[best_s, 0, num_devices]
    current_s = best_s
    current_layer = 0
    current_devices = num_devices

    # Backtrack through f_argmin to recover the stage partition.
    res = []
    while current_s > 0 and current_layer < num_layers and current_devices > 0:
        next_start_layer, submesh_choice, autosharding_choice = f_argmin[current_s, current_layer, current_devices]
        assert next_start_layer != -1 and current_devices != -1
        res.append(((current_layer, next_start_layer), submesh_choice, autosharding_choice))
        current_s -= 1
        current_layer = next_start_layer
        current_devices -= int(np.prod(np.array(submesh_choices[submesh_choice])))
    assert current_s == 0 and current_layer == num_layers and current_devices == 0

    return total_cost, res
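
# A returned solution is a list of stages, e.g. (illustrative values only):
#   res = [((0, 4), 1, 0), ((4, 8), 2, 1)]
# reads as: layers [0, 4) run on submesh_choices[1] with autosharding config 0,
# and layers [4, 8) run on submesh_choices[2] with autosharding config 1.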


def alpa_dp(
    num_layers, num_devices, num_microbatches, submesh_choices, num_autosharding_configs, compute_cost, gap=1e-6
):
    """Alpa's auto stage construction via dynamic programming.

    Code reference: https://github.com/alpa-projects/alpa/blob/main/alpa/pipeline_parallel/stage_construction.py

    Arguments:
        submesh_choices: List[(int, int)], candidate submesh shapes from get_submesh_choices.
        num_autosharding_configs: the number of autosharding configs profiled for each
            t_intra(start_layer, end_layer, LogicalMesh) entry.
        compute_cost: np.array of shape (num_layers, num_layers, num_submesh_choices, num_autosharding_configs).
        gap: minimum difference between consecutive stage-cost bounds worth retrying.
    """
    assert np.shape(compute_cost) == (
        num_layers,
        num_layers,
        len(submesh_choices),
        num_autosharding_configs,
    ), "Cost shape wrong."
    all_possible_stage_costs = np.sort(np.unique(compute_cost))
    best_cost = np.inf
    best_solution = None
    last_max_stage_cost = 0.0
    # TODO: [luzgh]: Why does alpa need the num_autosharding_configs dimension in compute_cost?
    # In dp_impl it seems the argmin n_config will be chosen. Just amin here.
    best_configs = np.argmin(compute_cost, axis=3)
    best_compute_cost = np.amin(compute_cost, axis=3)
    assert len(all_possible_stage_costs), "No solution in auto stage construction."
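    # Enumerate every distinct stage cost (ascending) as a candidate bound on the
    # bottleneck stage. A partition that first becomes feasible at bound t has a
    # bottleneck stage of cost t, so its total cost is at least num_microbatches * t;
    # once t * num_microbatches >= best_cost, no larger bound can improve the result.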
    for max_stage_cost in all_possible_stage_costs:
        if max_stage_cost * num_microbatches >= best_cost:
            break
        if max_stage_cost - last_max_stage_cost < gap:
            # Skip bounds that are numerically indistinguishable from the last one tried.
            continue
        cost, solution = alpa_dp_impl(
            num_layers, num_devices, num_microbatches, submesh_choices, best_compute_cost, max_stage_cost, best_configs
        )
        if cost < best_cost:
            best_cost = cost
            best_solution = solution
        last_max_stage_cost = max_stage_cost

    return best_cost, best_solution
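

# A minimal, self-contained smoke test (illustrative sizes and random costs only;
# a real compute_cost tensor comes from profiling, not from this sketch):
if __name__ == "__main__":
    num_hosts, num_devices_per_host = 1, 4
    num_layers, num_microbatches = 4, 8
    num_autosharding_configs = 2
    submesh_choices = get_submesh_choices(num_hosts, num_devices_per_host, mode="new")
    rng = np.random.default_rng(0)
    # compute_cost[k, i, m, c]: cost of layers k..i on submesh m under autosharding config c.
    compute_cost = rng.uniform(
        1.0, 2.0, size=(num_layers, num_layers, len(submesh_choices), num_autosharding_configs)
    ).astype(np.float32)
    best_cost, best_solution = alpa_dp(
        num_layers,
        num_hosts * num_devices_per_host,
        num_microbatches,
        submesh_choices,
        num_autosharding_configs,
        compute_cost,
    )
    print("submesh choices:", submesh_choices)
    print("best cost:", best_cost)
    print("best solution:", best_solution)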