diff --git a/colossalai/shardformer/policies/falcon.py b/colossalai/shardformer/policies/falcon.py
index f2eeb9d69..5c148880f 100644
--- a/colossalai/shardformer/policies/falcon.py
+++ b/colossalai/shardformer/policies/falcon.py
@@ -49,7 +49,7 @@ class FalconPolicy(Policy):
 
         if not self.model.config.new_decoder_architecture and self.model.config.multi_query:
             warnings.warn(
-                "Falcon dosen't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag."
+                "Falcon doesn't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag."
             )
             self.shard_config.enable_tensor_parallelism = False
 
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 1faa24f71..42bf0825b 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -46,7 +46,7 @@ class LlamaPolicy(Policy):
 
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("Llama dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("Llama doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             decoder_attribute_replacement = {
diff --git a/colossalai/shardformer/policies/mistral.py b/colossalai/shardformer/policies/mistral.py
index c16aa6dea..c0b8b3375 100644
--- a/colossalai/shardformer/policies/mistral.py
+++ b/colossalai/shardformer/policies/mistral.py
@@ -35,7 +35,7 @@ class MistralPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             warnings.warn(
-                "Mistral dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+                "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
             )
 
         if self.shard_config.enable_tensor_parallelism:
@@ -136,7 +136,7 @@ class MistralModelPolicy(MistralPolicy):
 
     def module_policy(self):
         if self.pipeline_stage_manager:
-            warnings.warn("Mistral dosen't support pipeline parallelism now.")
+            warnings.warn("Mistral doesn't support pipeline parallelism now.")
 
         return super().module_policy()
 
@@ -160,7 +160,7 @@ class MistralForCausalLMPolicy(MistralPolicy):
             }
 
             if self.pipeline_stage_manager:
-                warnings.warn("Mistral dosen't support pipeline parallelism now.")
+                warnings.warn("Mistral doesn't support pipeline parallelism now.")
 
             policy.update(new_item)
 
@@ -186,7 +186,7 @@ class MistralForSequenceClassificationPolicy(MistralPolicy):
             }
 
             if self.pipeline_stage_manager:
-                warnings.warn("Mistral dosen't support pipeline parallelism now.")
+                warnings.warn("Mistral doesn't support pipeline parallelism now.")
 
             policy.update(new_item)
         return policy
diff --git a/colossalai/shardformer/policies/opt.py b/colossalai/shardformer/policies/opt.py
index e2f3a829c..a542808ba 100644
--- a/colossalai/shardformer/policies/opt.py
+++ b/colossalai/shardformer/policies/opt.py
@@ -59,7 +59,7 @@ class OPTPolicy(Policy):
 
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("OPT dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("OPT doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             policy[OPTDecoder] = ModulePolicyDescription(
diff --git a/colossalai/shardformer/policies/t5.py b/colossalai/shardformer/policies/t5.py
index 4d906e3f4..e183b0632 100644
--- a/colossalai/shardformer/policies/t5.py
+++ b/colossalai/shardformer/policies/t5.py
@@ -66,7 +66,7 @@ class T5BasePolicy(Policy):
 
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("T5 dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("T5 doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             policy[T5Stack] = ModulePolicyDescription(
@@ -263,7 +263,7 @@ class T5BasePolicy(Policy):
         if num_decoder_layers == 0:
             return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages
 
-        # the number of stages distributed between encoder and decoder is optmized in this way:
+        # the number of stages distributed between encoder and decoder is optimized in this way:
         # num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages))
         # s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1
         def objective(num_encoder_stages):
diff --git a/colossalai/shardformer/policies/vit.py b/colossalai/shardformer/policies/vit.py
index 6ef0e3b34..584d4e265 100644
--- a/colossalai/shardformer/policies/vit.py
+++ b/colossalai/shardformer/policies/vit.py
@@ -33,7 +33,7 @@ class ViTPolicy(Policy):
 
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("Vit dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("Vit doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             policy[ViTEmbeddings] = ModulePolicyDescription(
diff --git a/colossalai/shardformer/policies/whisper.py b/colossalai/shardformer/policies/whisper.py
index 6dae99e8c..b5b5db79d 100644
--- a/colossalai/shardformer/policies/whisper.py
+++ b/colossalai/shardformer/policies/whisper.py
@@ -69,13 +69,13 @@ class WhisperPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             warnings.warn(
-                "Whisper dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+                "Whisper doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
             )
 
         # TODO using the jit fused add_and_dropout affect the accuracy
         if self.shard_config.enable_jit_fused:
             self.shard_config.enable_jit_fused = False
-            warnings.warn("Whisper dosen't support jit fused operator now, will ignore the jit fused operator flag.")
+            warnings.warn("Whisper doesn't support jit fused operator now, will ignore the jit fused operator flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             policy[WhisperEncoderLayer] = ModulePolicyDescription(
@@ -302,7 +302,7 @@ class WhisperPolicy(Policy):
         if num_decoder_layers == 0:
             return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages
 
-        # the number of stages distributed between encoder and decoder is optmized in this way:
+        # the number of stages distributed between encoder and decoder is optimized in this way:
         # num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages))
         # s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1
         def objective(num_encoder_stages):
diff --git a/examples/language/openmoe/model/openmoe_policy.py b/examples/language/openmoe/model/openmoe_policy.py
index f354bbea9..17e7aa46c 100644
--- a/examples/language/openmoe/model/openmoe_policy.py
+++ b/examples/language/openmoe/model/openmoe_policy.py
@@ -43,7 +43,7 @@ class OpenMoePolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             raise NotImplementedError(
-                "openmoe dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+                "openmoe doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.")
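Note on the comment corrected in the T5 and Whisper hunks: the argmin rule it describes can be checked with a small standalone sketch. This is only a hypothetical illustration of that formula, not the library's implementation; the function name split_stages and the brute-force search over the feasible range are assumptions made for the example.

# Hypothetical sketch of the stage-split rule quoted in the corrected comment:
# choose num_encoder_stages to minimize
#   abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages)
# subject to encoder_stages + decoder_stages == num_stages and both >= 1.
def split_stages(num_encoder_layers: int, num_decoder_layers: int, num_stages: int):
    def objective(num_encoder_stages: int) -> float:
        num_decoder_stages = num_stages - num_encoder_stages
        return abs(num_encoder_layers / num_encoder_stages - num_decoder_layers / num_decoder_stages)

    # brute-force search over the feasible range [1, num_stages - 1]
    best = min(range(1, num_stages), key=objective)
    return best, num_stages - best


# Example: 12 encoder layers, 12 decoder layers, 4 pipeline stages -> (2, 2)
print(split_stages(12, 12, 4))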