@@ -15,7 +15,6 @@ CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None
 
 
 def run_train():
-    assert non_distributed_component_funcs.get_callable('bert')
     for get_components_func in non_distributed_component_funcs:
         model_builder, train_dataloader, _, optimizer_builder, criterion = get_components_func()
 
@@ -27,12 +26,15 @@ def run_train():
 
         try:
             engine.train()
-            for img, label in train_dataloader:
+            for data, label in train_dataloader:
                 engine.zero_grad()
-                img = img.cuda()
+                data = data.cuda()
                 label = label.cuda()
-                output = engine(img)
-                loss = engine.criterion(output, label)
+                if criterion:
+                    output = engine(data)
+                    loss = engine.criterion(output, label)
+                else:
+                    loss = engine(data, label)
                 engine.backward(loss)
                 engine.step()
                 break
@@ -72,9 +74,9 @@ def run_engine(rank, world_size, port):
     # init dist env
     colossalai.launch(config=dict(), rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
     run_with_no_amp()
-    # run_with_torch_amp()
-    # run_with_apex_amp()
-    # run_with_naive_amp()
+    run_with_torch_amp()
+    run_with_apex_amp()
+    run_with_naive_amp()
 
 
 @pytest.mark.dist
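
For reference only, below is a minimal standalone sketch of the pattern the reworked loop in run_train now follows: components that ship an external criterion keep the old output = engine(data); loss = engine.criterion(output, label) path, while components whose criterion is None are expected to accept (data, label) and return the loss themselves. The SelfContainedModel and train_step names are illustrative placeholders written in plain PyTorch; they are not part of the patch or of the ColossalAI API.

# Illustrative sketch only (plain PyTorch, not ColossalAI code) of the
# criterion-optional dispatch used by the updated training loop.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SelfContainedModel(nn.Module):
    # Hypothetical toy model that computes its own loss, standing in for
    # registry entries whose criterion is None.
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 4)

    def forward(self, data, label):
        return F.cross_entropy(self.fc(data), label)


def train_step(model, data, label, criterion=None):
    # Mirrors the patched loop body: apply the external criterion if one is
    # provided, otherwise let the model return the loss directly.
    if criterion:
        output = model(data)
        loss = criterion(output, label)
    else:
        loss = model(data, label)
    loss.backward()
    return loss


data = torch.randn(2, 8)
label = torch.randint(0, 4, (2,))

# Model with an external criterion (the old code path).
print(train_step(nn.Linear(8, 4), data, label, criterion=nn.CrossEntropyLoss()).item())
# Model that returns its own loss (the new code path).
print(train_step(SelfContainedModel(), data, label).item())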