mirror of https://github.com/hpcaitech/ColossalAI
fix typo change dosen't to doesn't (#5308)
parent 6a3086a505
commit 71321a07cf
@@ -49,7 +49,7 @@ class FalconPolicy(Policy):
         if not self.model.config.new_decoder_architecture and self.model.config.multi_query:
             warnings.warn(
-                "Falcon dosen't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag."
+                "Falcon doesn't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag."
             )
             self.shard_config.enable_tensor_parallelism = False
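Aside from the spelling fix, every hunk in this commit follows the same fallback pattern: when a feature is unsupported for the current model configuration, the policy warns and clears the corresponding flag on the shard config instead of erroring out. A minimal runnable sketch of that pattern, using a simplified stand-in for ColossalAI's shard config (the dataclass and function names here are illustrative, not the library's API):

import warnings
from dataclasses import dataclass


@dataclass
class ShardConfig:
    # Simplified stand-in for the real shard config; only the flag used below.
    enable_tensor_parallelism: bool = True


def disable_unsupported_tp(shard_config: ShardConfig, tp_supported: bool) -> None:
    # Mirror of the pattern in the hunks: warn once, then clear the flag.
    if not tp_supported and shard_config.enable_tensor_parallelism:
        warnings.warn("tensor parallelism not supported for this configuration, ignoring the flag.")
        shard_config.enable_tensor_parallelism = False


cfg = ShardConfig()
disable_unsupported_tp(cfg, tp_supported=False)
assert cfg.enable_tensor_parallelism is False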
@@ -46,7 +46,7 @@ class LlamaPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("Llama dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("Llama doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")

         if self.shard_config.enable_tensor_parallelism:
             decoder_attribute_replacement = {
@@ -35,7 +35,7 @@ class MistralPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             warnings.warn(
-                "Mistral dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+                "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
             )

         if self.shard_config.enable_tensor_parallelism:
@@ -136,7 +136,7 @@ class MistralModelPolicy(MistralPolicy):
     def module_policy(self):
         if self.pipeline_stage_manager:
-            warnings.warn("Mistral dosen't support pipeline parallelism now.")
+            warnings.warn("Mistral doesn't support pipeline parallelism now.")

         return super().module_policy()
@@ -160,7 +160,7 @@ class MistralForCausalLMPolicy(MistralPolicy):
         }

         if self.pipeline_stage_manager:
-            warnings.warn("Mistral dosen't support pipeline parallelism now.")
+            warnings.warn("Mistral doesn't support pipeline parallelism now.")

         policy.update(new_item)
@@ -186,7 +186,7 @@ class MistralForSequenceClassificationPolicy(MistralPolicy):
         }

         if self.pipeline_stage_manager:
-            warnings.warn("Mistral dosen't support pipeline parallelism now.")
+            warnings.warn("Mistral doesn't support pipeline parallelism now.")

         policy.update(new_item)
         return policy
@@ -59,7 +59,7 @@ class OPTPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("OPT dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("OPT doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")

         if self.shard_config.enable_tensor_parallelism:
             policy[OPTDecoder] = ModulePolicyDescription(
@@ -66,7 +66,7 @@ class T5BasePolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("T5 dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("T5 doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")

         if self.shard_config.enable_tensor_parallelism:
             policy[T5Stack] = ModulePolicyDescription(
@@ -263,7 +263,7 @@ class T5BasePolicy(Policy):
         if num_decoder_layers == 0:
             return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages

-        # the number of stages distributed between encoder and decoder is optmized in this way:
+        # the number of stages distributed between encoder and decoder is optimized in this way:
         # num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages))
         # s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1
         def objective(num_encoder_stages):
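The argmin described in that comment can be made concrete in a few lines of standalone Python. This is an illustrative sketch, not the repository's distribute_t5_layers implementation (the function name and the brute-force search are assumptions); it tries every feasible encoder stage count and keeps the one that best balances layers per stage between encoder and decoder:

def balanced_encoder_stages(num_encoder_layers: int, num_decoder_layers: int, num_stages: int) -> int:
    # objective(k) = |num_encoder_layers / k - num_decoder_layers / (num_stages - k)|
    # subject to 1 <= k <= num_stages - 1, matching the comment in the hunk above.
    def objective(k: int) -> float:
        return abs(num_encoder_layers / k - num_decoder_layers / (num_stages - k))

    return min(range(1, num_stages), key=objective)


# Example: 24 encoder layers, 24 decoder layers, 4 pipeline stages -> 2 encoder stages.
print(balanced_encoder_stages(24, 24, 4))  # 2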
@@ -33,7 +33,7 @@ class ViTPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("Vit dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+            warnings.warn("Vit doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")

         if self.shard_config.enable_tensor_parallelism:
             policy[ViTEmbeddings] = ModulePolicyDescription(
@@ -69,13 +69,13 @@ class WhisperPolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             warnings.warn(
-                "Whisper dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+                "Whisper doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
             )

         # TODO using the jit fused add_and_dropout affect the accuracy
         if self.shard_config.enable_jit_fused:
             self.shard_config.enable_jit_fused = False
-            warnings.warn("Whisper dosen't support jit fused operator now, will ignore the jit fused operator flag.")
+            warnings.warn("Whisper doesn't support jit fused operator now, will ignore the jit fused operator flag.")

         if self.shard_config.enable_tensor_parallelism:
             policy[WhisperEncoderLayer] = ModulePolicyDescription(
@@ -302,7 +302,7 @@ class WhisperPolicy(Policy):
         if num_decoder_layers == 0:
             return Policy.distribute_layers(num_encoder_layers, num_stages), num_stages

-        # the number of stages distributed between encoder and decoder is optmized in this way:
+        # the number of stages distributed between encoder and decoder is optimized in this way:
         # num_encoder_stages = argmin(abs(num_encoder_layers / encoder_stages - num_decoder_layers / decoder_stages))
         # s.t. num_encoder_stages + num_decoder_stages = num_stages, num_encoder_stages >= 1, num_decoder_stages >= 1
         def objective(num_encoder_stages):
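Worked numbers for the same objective (illustrative values, not taken from the diff): with 32 encoder layers, 32 decoder layers, and 4 stages, objective(1) = |32 - 32/3| ≈ 21.3, objective(2) = |16 - 16| = 0, and objective(3) ≈ 21.3, so the split assigns two stages to the encoder and two to the decoder.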
@@ -43,7 +43,7 @@ class OpenMoePolicy(Policy):
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             raise NotImplementedError(
-                "openmoe dosen't support sequence parallelism now, will ignore the sequence parallelism flag.")
+                "openmoe doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")

         if self.shard_config.enable_tensor_parallelism:
             raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.")