#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from functools import partial

import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision.models import resnet50

import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.testing import rerun_if_address_is_in_use
from colossalai.utils import free_port
from colossalai.zero.legacy.init_ctx import ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy


def run_dist(rank, world_size, port):
    # this test runs on resnet50, whose BatchNorm buffers must stay
    # synchronized across ranks; cudnn is configured to be deterministic
    # so that the randomness of the convolution layers is disabled
    zero_config = dict(model_config=dict(shard_strategy=TensorShardStrategy()))
    colossalai.launch(config=dict(zero=zero_config, cudnn_deterministic=True, cudnn_benchmark=False),
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=port,
                      backend='nccl')

    # build the model inside the ZeRO init context so that its parameters are sharded
    with ZeroInitContext(target_device=torch.cuda.current_device(),
                         shard_strategy=gpc.config.zero.model_config.shard_strategy,
                         shard_param=True):
        model = resnet50()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    engine, *args = colossalai.initialize(model, optimizer, criterion)

    # train for a few dummy iterations so that the BatchNorm running stats get updated
    engine.train()
    for _ in range(2):
        data = torch.rand(4, 3, 128, 128).cuda().half()
        label = torch.randint(0, 10, size=(4,)).cuda()
        engine.zero_grad()
        out = engine(data)
        loss = engine.criterion(out, label)
        engine.backward(loss)
        engine.step()

    # test
    # the batch norm stats must be synchronized so that, given the same input,
    # the model produces the same output on different ranks
    engine.eval()
    data = torch.rand(4, 3, 128, 128).cuda().half()
    dist.broadcast(data, src=0, group=gpc.get_group(ParallelMode.DATA))

    # predict
    out = engine(data)

    # gather the outputs from all ranks and check that they are identical
    tensor_list = [torch.empty_like(out) for _ in range(world_size)]
    dist.all_gather(tensor_list=tensor_list, tensor=out, group=gpc.get_group(ParallelMode.DATA))
    assert torch.all(tensor_list[0] == tensor_list[1]), \
        'expected the output from different ranks to be the same, but got different values'


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_sharded_optim_with_sync_bn():
    """
    This test makes sure that module buffers are synchronized between ranks
    when using ZeRO. An example of a module buffer is the running stats of a
    BatchNorm layer, i.e. its running mean and variance.

    If the buffers are not synchronized, the model will produce different
    outputs on different ranks even though the input and parameters are the
    same, which is not wanted when doing predictions.
    """
    world_size = 2
    run_func = partial(run_dist, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_sharded_optim_with_sync_bn()
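

# --- Illustrative sketch (not part of the original test) ---------------------
# A hedged example of how the BatchNorm buffers themselves could be compared
# across ranks, rather than only comparing the model outputs as the test above
# does. The helper below is hypothetical: it assumes the unwrapped model is
# reachable (e.g. via `engine.model`) and that its buffers are replicated,
# not sharded, on every rank in the given process group.
def assert_bn_buffers_synced(model, group):
    world_size = dist.get_world_size(group)
    for name, buf in model.named_buffers():
        # only inspect the floating-point running statistics of BatchNorm layers
        if 'running_mean' in name or 'running_var' in name:
            gathered = [torch.empty_like(buf) for _ in range(world_size)]
            dist.all_gather(gathered, buf.contiguous(), group=group)
            for other in gathered[1:]:
                assert torch.allclose(gathered[0], other), \
                    f'buffer {name} differs across ranks'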