From 0f5f5dd556f4e4755934e550a5f8f2fbaadb1174 Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Wed, 16 Mar 2022 14:23:43 +0800
Subject: [PATCH] fixed gpt attention mask in pipeline (#430)

---
 model_zoo/gpt/gpt.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/model_zoo/gpt/gpt.py b/model_zoo/gpt/gpt.py
index b5413f6b8..d544f9d71 100644
--- a/model_zoo/gpt/gpt.py
+++ b/model_zoo/gpt/gpt.py
@@ -51,18 +51,6 @@ class GPTEmbedding(nn.Module):
             x = x + self.tokentype_embeddings(tokentype_ids)
         x = self.dropout(x)
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # Adapted from huggingface
-        if attention_mask is not None:
-            batch_size = input_ids.shape[0]
-            attention_mask = attention_mask.view(batch_size, -1)
-            attention_mask = col_nn.partition_batch(attention_mask)
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-            attention_mask = attention_mask.to(dtype=x.dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * -10000.0
-
         return x, attention_mask
 
@@ -355,6 +343,21 @@ class PipelineGPT(nn.Module):
 
         if self.first:
             x, attention_mask = self.embed(input_ids, attention_mask)
 
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # Adapted from huggingface
+        if attention_mask is not None:
+            if self.first:
+                batch_size = input_ids.shape[0]
+            else:
+                batch_size = x.shape[0]
+            attention_mask = attention_mask.view(batch_size, -1)
+            attention_mask = col_nn.partition_batch(attention_mask)
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+            attention_mask = attention_mask.to(dtype=x.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         for block in self.blocks:
             x, attention_mask = block(x, attention_mask)
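
Note (not part of the patch): the added block moves the 2D-to-additive-4D mask conversion out of GPTEmbedding and into PipelineGPT, taking the batch size from x.shape[0] on non-first pipeline stages, which never receive input_ids. Below is a minimal standalone sketch of that conversion in plain PyTorch; the function name build_additive_attention_mask is hypothetical, and the ColossalAI-specific col_nn.partition_batch step is omitted.

import torch


def build_additive_attention_mask(attention_mask: torch.Tensor,
                                  dtype: torch.dtype = torch.float16) -> torch.Tensor:
    """Turn a [batch_size, seq_len] padding mask (1 = keep, 0 = pad) into an
    additive mask of shape [batch_size, 1, 1, seq_len] that broadcasts over
    [batch_size, num_heads, from_seq_len, to_seq_len] when added to the
    attention scores."""
    batch_size = attention_mask.shape[0]
    mask = attention_mask.view(batch_size, -1)
    mask = mask.unsqueeze(1).unsqueeze(2)   # [batch_size, 1, 1, seq_len]
    mask = mask.to(dtype=dtype)             # fp16 compatibility
    return (1.0 - mask) * -10000.0          # 0.0 where kept, -10000.0 where padded


if __name__ == "__main__":
    # Two sequences of length 4; the second has its last two positions padded.
    pad_mask = torch.tensor([[1, 1, 1, 1],
                             [1, 1, 0, 0]])
    additive = build_additive_attention_mask(pad_mask)
    print(additive.shape)      # torch.Size([2, 1, 1, 4])
    print(additive[1, 0, 0])   # -0. for kept positions, -10000. for padded ones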