From 5ce6c9d86fe667d7ef5cd70a106b88073b640c20 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Fri, 19 May 2023 12:12:20 +0800 Subject: [PATCH] [doc] add tutorial for cluster utils (#3763) * [doc] add en cluster utils doc * [doc] add zh cluster utils doc * [doc] add cluster utils doc in sidebar --- docs/sidebars.json | 3 +- docs/source/en/features/cluster_utils.md | 32 +++++++++++++++++++ docs/source/zh-Hans/features/cluster_utils.md | 32 +++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/features/cluster_utils.md create mode 100644 docs/source/zh-Hans/features/cluster_utils.md diff --git a/docs/sidebars.json b/docs/sidebars.json index 2732704a5..dd3a4e5ec 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -58,7 +58,8 @@ ] }, "features/pipeline_parallel", - "features/nvme_offload" + "features/nvme_offload", + "features/cluster_utils" ] }, { diff --git a/docs/source/en/features/cluster_utils.md b/docs/source/en/features/cluster_utils.md new file mode 100644 index 000000000..1903d64d2 --- /dev/null +++ b/docs/source/en/features/cluster_utils.md @@ -0,0 +1,32 @@ +# Cluster Utilities + +Author: [Hongxin Liu](https://github.com/ver217) + +**Prerequisite:** +- [Distributed Training](../concepts/distributed_training.md) + +## Introduction + +We provide a utility class `colossalai.cluster.DistCoordinator` to coordinate distributed training. It's useful for getting various information about the cluster, such as the number of nodes and the number of processes per node.
+ +## API Reference + +{{ autodoc:colossalai.cluster.DistCoordinator }} + +{{ autodoc:colossalai.cluster.DistCoordinator.is_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.is_node_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.is_last_process }} + +{{ autodoc:colossalai.cluster.DistCoordinator.print_on_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.print_on_node_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.priority_execution }} + +{{ autodoc:colossalai.cluster.DistCoordinator.destroy }} + +{{ autodoc:colossalai.cluster.DistCoordinator.block_all }} + +{{ autodoc:colossalai.cluster.DistCoordinator.on_master_only }} diff --git a/docs/source/zh-Hans/features/cluster_utils.md b/docs/source/zh-Hans/features/cluster_utils.md new file mode 100644 index 000000000..ca787a869 --- /dev/null +++ b/docs/source/zh-Hans/features/cluster_utils.md @@ -0,0 +1,32 @@ +# 集群实用程序 + +作者: [Hongxin Liu](https://github.com/ver217) + +**前置教程:** +- [分布式训练](../concepts/distributed_training.md) + +## 引言 + +我们提供了一个实用程序类 `colossalai.cluster.DistCoordinator` 来协调分布式训练。它对于获取有关集群的各种信息很有用,例如节点数、每个节点的进程数等。 + +## API 参考 + +{{ autodoc:colossalai.cluster.DistCoordinator }} + +{{ autodoc:colossalai.cluster.DistCoordinator.is_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.is_node_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.is_last_process }} + +{{ autodoc:colossalai.cluster.DistCoordinator.print_on_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.print_on_node_master }} + +{{ autodoc:colossalai.cluster.DistCoordinator.priority_execution }} + +{{ autodoc:colossalai.cluster.DistCoordinator.destroy }} + +{{ autodoc:colossalai.cluster.DistCoordinator.block_all }} + +{{ autodoc:colossalai.cluster.DistCoordinator.on_master_only }}