diff --git a/applications/Chat/coati/kernels/opt_attn.py b/applications/Chat/coati/kernels/opt_attn.py
index c10f341e9..e99f9c224 100644
--- a/applications/Chat/coati/kernels/opt_attn.py
+++ b/applications/Chat/coati/kernels/opt_attn.py
@@ -77,7 +77,7 @@ class XOPTAttention(OPTAttention):
                                                      scale=self.scaling)
 
         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py
index 0200cd3c6..1f20fca4f 100644
--- a/colossalai/communication/p2p.py
+++ b/colossalai/communication/p2p.py
@@ -217,7 +217,7 @@ def recv_backward(output_grad_shape,
         next_rank (int, optional): The rank of the source of the tensor.
 
     Returns:
-        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradident tensor list.
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradient tensor list.
     """
     if gpc.is_pipeline_last_stage():
         output_tensor_grad = None
diff --git a/colossalai/communication/p2p_v2.py b/colossalai/communication/p2p_v2.py
index 0dacd8c3c..090311cb3 100644
--- a/colossalai/communication/p2p_v2.py
+++ b/colossalai/communication/p2p_v2.py
@@ -19,7 +19,7 @@ _unpickler = pickle.Unpickler
 
 
 def init_process_group():
-    """intialise process group by dist.new_group in the adjacent stages
+    """initialise process group by dist.new_group in the adjacent stages
 
     Args:
         None
diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/context/process_group_initializer/initializer_sequence.py
index eaacb14d2..251a29407 100644
--- a/colossalai/context/process_group_initializer/initializer_sequence.py
+++ b/colossalai/context/process_group_initializer/initializer_sequence.py
@@ -91,11 +91,11 @@ class Initializer_Sequence(ProcessGroupInitializer):
         parallel_setting = []
 
-        local_rank, group_world_size, process_group, cpu_grop, ranks_in_group, mode = \
+        local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode = \
            self._sequence_initializer.init_dist_group()
         # change mode to sequence
         mode = ParallelMode.SEQUENCE
 
-        parallel_setting.append((local_rank, group_world_size, process_group, cpu_grop, ranks_in_group, mode))
+        parallel_setting.append((local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode))
         parallel_setting.append(self._sequence_dp_initializer.init_dist_group())
 
         return parallel_setting
diff --git a/examples/tutorial/new_api/cifar_resnet/train.py b/examples/tutorial/new_api/cifar_resnet/train.py
index a96a4b640..fe0dabf08 100644
--- a/examples/tutorial/new_api/cifar_resnet/train.py
+++ b/examples/tutorial/new_api/cifar_resnet/train.py
@@ -28,7 +28,7 @@ LEARNING_RATE = 1e-3
 
 
 def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
-    # trainsform
+    # transform
     transform_train = transforms.Compose(
         [transforms.Pad(4),
          transforms.RandomHorizontalFlip(),
diff --git a/examples/tutorial/new_api/cifar_vit/train.py b/examples/tutorial/new_api/cifar_vit/train.py
index 2405fdfc6..82a8f2ed9 100644
--- a/examples/tutorial/new_api/cifar_vit/train.py
+++ b/examples/tutorial/new_api/cifar_vit/train.py
@@ -25,7 +25,7 @@ from colossalai.utils import get_current_device
 
 # ==============================
 # Prepare Hyperparameters
 # ==============================
 NUM_EPOCHS = 60
-WARMUP_EPOCSH = 5
+WARMUP_EPOCHS = 5
 LEARNING_RATE = 1e-3
@@ -37,7 +37,7 @@ def vit_cifar(**kwargs):
 
 
 def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
-    # trainsform
+    # transform
     transform_train = transforms.Compose([
         transforms.RandomCrop(32, padding=4),
         transforms.RandomHorizontalFlip(),
@@ -177,7 +177,7 @@ def main():
     optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
 
     # lr scheduler
-    lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCSH)
+    lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCHS)
 
     # ==============================
     # Boost with ColossalAI
diff --git a/op_builder/utils.py b/op_builder/utils.py
index 1b1bd5f49..2dbd976fb 100644
--- a/op_builder/utils.py
+++ b/op_builder/utils.py
@@ -36,7 +36,7 @@ def get_cuda_version_in_pytorch() -> List[int]:
         torch_cuda_minor = torch.version.cuda.split(".")[1]
     except:
         raise ValueError(
-            "[extension] Cannot retrive the CUDA version in the PyTorch binary given by torch.version.cuda")
+            "[extension] Cannot retrieve the CUDA version in the PyTorch binary given by torch.version.cuda")
     return torch_cuda_major, torch_cuda_minor
 
 
diff --git a/tests/components_to_test/albert.py b/tests/components_to_test/albert.py
index d5b6bc89a..52b2275ec 100644
--- a/tests/components_to_test/albert.py
+++ b/tests/components_to_test/albert.py
@@ -28,7 +28,7 @@ def get_training_components():
         print('building AlbertForSequenceClassification model')
 
         # adapting huggingface BertForSequenceClassification for single unitest calling interface
-        class ModelAaptor(AlbertForSequenceClassification):
+        class ModelAdaptor(AlbertForSequenceClassification):
 
             def forward(self, input_ids, labels):
                 """
@@ -37,23 +37,23 @@ def get_training_components():
                 """
                 return super().forward(input_ids=input_ids, labels=labels)[0]
 
-        model = ModelAaptor(config)
+        model = ModelAdaptor(config)
         # if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"):
         #     model.gradient_checkpointing_enable()
 
         return model
 
 
-    is_distrbuted = torch.distributed.is_initialized()
+    is_distributed = torch.distributed.is_initialized()
     trainloader = get_bert_data_loader(n_class=vocab_size,
                                        batch_size=2,
                                        total_samples=10000,
                                        sequence_length=sequence_length,
-                                       is_distrbuted=is_distrbuted)
+                                       is_distributed=is_distributed)
     testloader = get_bert_data_loader(n_class=vocab_size,
                                       batch_size=2,
                                       total_samples=10000,
                                       sequence_length=sequence_length,
-                                      is_distrbuted=is_distrbuted)
+                                      is_distributed=is_distributed)
     criterion = None
     return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion
diff --git a/tests/components_to_test/beit.py b/tests/components_to_test/beit.py
index 1252071f4..2021ae6f6 100644
--- a/tests/components_to_test/beit.py
+++ b/tests/components_to_test/beit.py
@@ -27,7 +27,7 @@ class DummyDataLoader(DummyDataGenerator):
 @non_distributed_component_funcs.register(name='beit')
 def get_training_components():
 
-    def model_buider(checkpoint=False):
+    def model_builder(checkpoint=False):
         model = Beit(img_size=DummyDataLoader.img_size,
                      num_classes=DummyDataLoader.num_class,
                      embed_dim=32,
@@ -39,4 +39,4 @@ def get_training_components():
     testloader = DummyDataLoader()
     criterion = torch.nn.CrossEntropyLoss()
 
-    return model_buider, trainloader, testloader, torch.optim.Adam, criterion
+    return model_builder, trainloader, testloader, torch.optim.Adam, criterion
diff --git a/tests/components_to_test/bert.py b/tests/components_to_test/bert.py
index c1faa6f9d..e7d1d5080 100644
--- a/tests/components_to_test/bert.py
+++ b/tests/components_to_test/bert.py
@@ -13,7 +13,7 @@ def get_bert_data_loader(
     total_samples,
     sequence_length,
     device=torch.device('cpu:0'),
-    is_distrbuted=False,
+    is_distributed=False,
 ):
     train_data = torch.randint(
         low=0,
@@ -24,7 +24,7 @@ def get_bert_data_loader(
     )
     train_label = torch.randint(low=0, high=2, size=(total_samples,), device=device, dtype=torch.long)
     train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
-    if is_distrbuted:
+    if is_distributed:
         sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
     else:
         sampler = SequentialSampler(train_dataset)
@@ -52,8 +52,8 @@ def get_training_components():
                             attention_probs_dropout_prob=0.)
         print('building BertForSequenceClassification model')
 
-        # adapting huggingface BertForSequenceClassification for single unitest calling interface
-        class ModelAaptor(BertForSequenceClassification):
+        # adapting huggingface BertForSequenceClassification for single unittest calling interface
+        class ModelAdaptor(BertForSequenceClassification):
 
             def forward(self, input_ids, labels):
                 """
@@ -62,23 +62,23 @@ def get_training_components():
                 """
                 return super().forward(input_ids=input_ids, labels=labels)[0]
 
-        model = ModelAaptor(config)
+        model = ModelAdaptor(config)
         if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"):
             model.gradient_checkpointing_enable()
 
         return model
 
 
-    is_distrbuted = torch.distributed.is_initialized()
+    is_distributed = torch.distributed.is_initialized()
     trainloader = get_bert_data_loader(n_class=vocab_size,
                                        batch_size=2,
                                        total_samples=10000,
                                        sequence_length=sequence_length,
-                                       is_distrbuted=is_distrbuted)
+                                       is_distributed=is_distributed)
     testloader = get_bert_data_loader(n_class=vocab_size,
                                       batch_size=2,
                                       total_samples=10000,
                                       sequence_length=sequence_length,
-                                      is_distrbuted=is_distrbuted)
+                                      is_distributed=is_distributed)
     criterion = None
     return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion
diff --git a/tests/components_to_test/registry.py b/tests/components_to_test/registry.py
index 728ed9eba..edfcaaa72 100644
--- a/tests/components_to_test/registry.py
+++ b/tests/components_to_test/registry.py
@@ -9,10 +9,10 @@ class Registry:
     def register(self, name):
         assert name not in self._registry
 
-        def _regsiter(callable_):
+        def _register(callable_):
             self._registry[name] = callable_
 
-        return _regsiter
+        return _register
 
     def get_callable(self, name: str):
         return self._registry[name]
@@ -34,6 +34,6 @@ class Registry:
 
 
 non_distributed_component_funcs = Registry()
-model_paralle_component_funcs = Registry()
+model_parallel_component_funcs = Registry()
 
-__all__ = ['non_distributed_component_funcs', 'model_paralle_component_funcs']
+__all__ = ['non_distributed_component_funcs', 'model_parallel_component_funcs']
diff --git a/tests/test_utils/test_activation_checkpointing.py b/tests/test_utils/test_activation_checkpointing.py
index 59a8acd4b..2930552cc 100644
--- a/tests/test_utils/test_activation_checkpointing.py
+++ b/tests/test_utils/test_activation_checkpointing.py
@@ -51,7 +51,7 @@ def test_activation_checkpointing(cpu_offload, use_reentrant):
 
     # other tests might affect this test
     reset_seeds()
 
-    # We put initilization here to avoid change cuda rng state below
+    # We put initialization here to avoid change cuda rng state below
     inputs = torch.rand(2, 2, requires_grad=True, device='cuda')
     weight = torch.rand(2, 4, requires_grad=True, device='cuda')
diff --git a/tests/test_utils/test_checkpoint_io/test_load.py b/tests/test_utils/test_checkpoint_io/test_load.py
index b1a741515..2949c9f07 100644
--- a/tests/test_utils/test_checkpoint_io/test_load.py
+++ b/tests/test_utils/test_checkpoint_io/test_load.py
@@ -23,7 +23,7 @@ def check_model_state_dict(a: Dict[str, Tensor], b: Dict[str, Tensor]) -> None:
         assert torch.equal(v, b[k])
 
 
-def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) -> None:
+def check_optim_state_dict(a: dict, b: dict, ignore_param_groups: bool = False) -> None:
     assert set(a['state'].keys()) == set(b['state'].keys())
     for k, state in a['state'].items():
         b_state = b['state'][k]
@@ -32,7 +32,7 @@ def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False)
                 assert torch.equal(v1, v2)
             else:
                 assert v1 == v2
-    if not ignore_param_gruops:
+    if not ignore_param_groups:
         assert a['param_groups'] == b['param_groups']
 
 
@@ -129,23 +129,23 @@ def launch_dist(fn, world_size: int):
 
 
 def save_dist(dir_name: str, zero: bool):
-    model, optmizer = prepare_model_optim(shard=True, zero=zero)
-    reset_model_optim(model, optmizer)
+    model, optimizer = prepare_model_optim(shard=True, zero=zero)
+    reset_model_optim(model, optimizer)
     world_size = dist.get_world_size()
     rank = dist.get_rank()
-    save(dir_name, model, optmizer, dist_meta=get_dist_metas(world_size, zero)[rank])
+    save(dir_name, model, optimizer, dist_meta=get_dist_metas(world_size, zero)[rank])
 
 
 def load_and_check_dist(dir_name: str):
     world_size = dist.get_world_size()
-    model, optmizer = prepare_model_optim(shard=True)
-    reset_model_optim(model, optmizer)
+    model, optimizer = prepare_model_optim(shard=True)
+    reset_model_optim(model, optimizer)
     model_state_dict = deepcopy(model.state_dict())
-    optimizer_state_dict = deepcopy(optmizer.state_dict())
-    reset_model_optim(model, optmizer, 1)
-    load(dir_name, model, optmizer, get_redist_meta(world_size), get_dist_metas(world_size))
+    optimizer_state_dict = deepcopy(optimizer.state_dict())
+    reset_model_optim(model, optimizer, 1)
+    load(dir_name, model, optimizer, get_redist_meta(world_size), get_dist_metas(world_size))
     check_model_state_dict(model_state_dict, model.state_dict())
-    check_optim_state_dict(optimizer_state_dict, optmizer.state_dict())
+    check_optim_state_dict(optimizer_state_dict, optimizer.state_dict())
 
 
 @pytest.mark.dist
diff --git a/tests/test_utils/test_checkpoint_io/test_merge.py b/tests/test_utils/test_checkpoint_io/test_merge.py
index 255c74adf..07d4597f8 100644
--- a/tests/test_utils/test_checkpoint_io/test_merge.py
+++ b/tests/test_utils/test_checkpoint_io/test_merge.py
@@ -68,7 +68,7 @@ def run_dist(rank, world_size, port, test_fn):
 
 
 def run_save_dist(dir_name: str, zero: bool):
-    model, optmizer = prepare_model_optim(shard=True, zero=zero)
+    model, optimizer = prepare_model_optim(shard=True, zero=zero)
     rank = dist.get_rank()
     dp_world_size = dist.get_world_size() // 2
     if not zero:
@@ -90,7 +90,7 @@ def run_save_dist(dir_name: str, zero: bool):
             'fc.bias': ParamDistMeta(rank // 2, dp_world_size, 0, 1, zero_numel=1, zero_orig_shape=[1])
         }
 
-    save(dir_name, model, optmizer, dist_meta=dist_metas)
+    save(dir_name, model, optimizer, dist_meta=dist_metas)
 
 
 @pytest.mark.dist
diff --git a/tests/test_utils/test_checkpoint_io/test_redist.py b/tests/test_utils/test_checkpoint_io/test_redist.py
index 144715bdf..fdc849a5e 100644
--- a/tests/test_utils/test_checkpoint_io/test_redist.py
+++ b/tests/test_utils/test_checkpoint_io/test_redist.py
@@ -125,9 +125,9 @@ def run_dist(rank, world_size, port, test_fn):
 
 
 def run_save_dist(dir_name: str, zero: bool):
-    model, optmizer = prepare_model_optim(shard=True, zero=zero)
+    model, optimizer = prepare_model_optim(shard=True, zero=zero)
     rank = dist.get_rank()
-    save(dir_name, model, optmizer, dist_meta=get_dist_metas(4, zero)[rank])
+    save(dir_name, model, optimizer, dist_meta=get_dist_metas(4, zero)[rank])
 
 
 @pytest.mark.dist
diff --git a/tests/test_utils/test_checkpoint_io/test_save.py b/tests/test_utils/test_checkpoint_io/test_save.py
index e35e566f6..2abdd95a6 100644
--- a/tests/test_utils/test_checkpoint_io/test_save.py
+++ b/tests/test_utils/test_checkpoint_io/test_save.py
@@ -28,7 +28,7 @@ def check_model_state_dict(a: Dict[str, Tensor], b: Dict[str, Tensor]) -> None:
         assert torch.equal(v, b[k])
 
 
-def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) -> None:
+def check_optim_state_dict(a: dict, b: dict, ignore_param_groups: bool = False) -> None:
     assert set(a['state'].keys()) == set(b['state'].keys())
     for k, state in a['state'].items():
         b_state = b['state'][k]
@@ -37,7 +37,7 @@ def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False)
                 assert torch.equal(v1, v2)
             else:
                 assert v1 == v2
-    if not ignore_param_gruops:
+    if not ignore_param_groups:
         assert a['param_groups'] == b['param_groups']
 
 
@@ -113,12 +113,12 @@ def run_dist(rank, world_size, port, test_fn):
 
 
 def run_save_dist(dir_name):
-    model, optmizer = prepare_model_optim()
+    model, optimizer = prepare_model_optim()
     dist_metas = {
         'fc.weight': ParamDistMeta(dist.get_rank(), dist.get_world_size(), 0, 1),
         'fc.bias': ParamDistMeta(dist.get_rank(), dist.get_world_size(), 0, 1)
     }
-    save(dir_name, model, optmizer, dist_meta=dist_metas)
+    save(dir_name, model, optimizer, dist_meta=dist_metas)
 
 
 @pytest.mark.dist
diff --git a/tests/test_utils/test_lazy_init/utils.py b/tests/test_utils/test_lazy_init/utils.py
index a8aeb4c89..0b5f15ca5 100644
--- a/tests/test_utils/test_lazy_init/utils.py
+++ b/tests/test_utils/test_lazy_init/utils.py
@@ -18,7 +18,7 @@ def set_seed(seed: int) -> None:
     torch.manual_seed(seed)
 
 
-def assert_model_eqaual(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
+def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
     s1 = m1.state_dict()
     s2 = m2.state_dict()
 
@@ -63,7 +63,7 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False,
     with ctx:
         deferred_model = model_fn()
     deferred_model = ctx.materialize(deferred_model, verbose=verbose)
-    assert_model_eqaual(model, deferred_model)
+    assert_model_equal(model, deferred_model)
     if check_forward:
         assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn)
     if verbose: