from functools import partial
from typing import Dict, List

import click
import torch.multiprocessing as mp

import colossalai
from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.testing import free_port
from colossalai.utils import MultiTimer

from .models import MLP


def run_benchmark(args: Config) -> None:
    """
    Run benchmarking with torch.multiprocessing.
    """

    # sanity checks
    if args.gpus is None:
        click.echo("Error: --num_gpus is not given")
        exit()
    if args.gpus <= 1:
        click.echo("Warning: tensor parallel requires at least 2 devices to be activated.")

    click.echo("=== Benchmarking Parameters ===")
    for k, v in args.items():
        click.echo(f'{k}: {v}')
    click.echo('')

    # enumerate all parallel configurations that fit the given number of GPUs
    config_list = find_all_configs(args.gpus)

    # reserve one free port per configuration so each run can initialize its own process group
    avail_ports = [free_port() for _ in range(len(config_list))]
    run_func = partial(run_dist_profiling,
                       world_size=args.gpus,
                       port_list=avail_ports,
                       config_list=config_list,
                       hyperparams=args)
    mp.spawn(run_func, nprocs=args.gpus)


def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
                       hyperparams: Config) -> None:
    """
    The profiling function executed on each process; it is meant to be spawned by torch.multiprocessing.

    Args:
        rank (int): rank of the process
        world_size (int): the number of processes
        port_list (List[int]): a list of free ports for initializing distributed networks
        config_list (List[Dict]): a list of parallel configurations
        hyperparams (Config): the hyperparameters given by the user
    """

    # disable logging for clean output
    disable_existing_loggers()
    logger = get_dist_logger()
    logger.set_level('WARNING')

    # profile each parallel configuration in its own distributed context
    for config, port in zip(config_list, port_list):
        colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
        timer = MultiTimer()

        # skip 1D parallel if in_features or out_features cannot be divided exactly by the 1D parallel size
        if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
            click.echo(
                "1D parallel will be skipped because in_features or out_features cannot be divided exactly by the 1D parallel size."
            )
            continue

        if hyperparams.model == 'mlp':
            model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
        else:
            if gpc.get_global_rank() == 0:
                click.echo("Error: Invalid argument for --model")
            exit()

        data_func = partial(get_batch_data,
                            dim=hyperparams.dimension,
                            batch_size=hyperparams.batch_size,
                            seq_length=hyperparams.seq_len,
                            mode=config.parallel.tensor.mode)
        fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
                                                                      warmup_steps=hyperparams.warmup_steps,
                                                                      profile_steps=hyperparams.profile_steps,
                                                                      data_func=data_func,
                                                                      timer=timer)

        # tear down the distributed context before moving on to the next configuration
        gpc.destroy()
        reset_seeds()

        # only the global rank 0 process reports the results
        if gpc.get_global_rank() == 0:
            config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
            click.echo(f"=== {config_str} ===")
            click.echo(f"Average forward time: {fwd_time}")
            click.echo(f"Average backward time: {bwd_time}")
            click.echo(f"Max allocated GPU memory: {max_allocated}")
            click.echo(f"Max cached GPU memory: {max_cached}\n")
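

# Usage sketch (illustrative only, not part of the CLI entry point): `run_benchmark`
# expects a `Config` whose keys match the hyperparameters read in this module
# (gpus, model, dimension, layers, batch_size, seq_len, warmup_steps, profile_steps).
# The concrete values below are assumptions chosen for demonstration.
if __name__ == '__main__':
    example_args = Config(
        dict(
            gpus=2,    # number of devices; tensor parallel needs at least 2
            model='mlp',    # only 'mlp' is handled in this module
            dimension=1024,    # MLP hidden dimension (in_features/out_features)
            layers=2,    # number of MLP layers
            batch_size=8,
            seq_len=512,
            warmup_steps=10,
            profile_steps=50,
        ))
    run_benchmark(example_args)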