From 0f5f5dd556f4e4755934e550a5f8f2fbaadb1174 Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Wed, 16 Mar 2022 14:23:43 +0800
Subject: [PATCH] fixed gpt attention mask in pipeline (#430)

---
 model_zoo/gpt/gpt.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/model_zoo/gpt/gpt.py b/model_zoo/gpt/gpt.py
index b5413f6b8..d544f9d71 100644
--- a/model_zoo/gpt/gpt.py
+++ b/model_zoo/gpt/gpt.py
@@ -51,18 +51,6 @@ class GPTEmbedding(nn.Module):
             x = x + self.tokentype_embeddings(tokentype_ids)
         x = self.dropout(x)
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # Adapted from huggingface
-        if attention_mask is not None:
-            batch_size = input_ids.shape[0]
-            attention_mask = attention_mask.view(batch_size, -1)
-            attention_mask = col_nn.partition_batch(attention_mask)
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-            attention_mask = attention_mask.to(dtype=x.dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * -10000.0
-
         return x, attention_mask
 
@@ -355,6 +343,21 @@ class PipelineGPT(nn.Module):
 
         if self.first:
             x, attention_mask = self.embed(input_ids, attention_mask)
 
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # Adapted from huggingface
+        if attention_mask is not None:
+            if self.first:
+                batch_size = input_ids.shape[0]
+            else:
+                batch_size = x.shape[0]
+            attention_mask = attention_mask.view(batch_size, -1)
+            attention_mask = col_nn.partition_batch(attention_mask)
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+            attention_mask = attention_mask.to(dtype=x.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         for block in self.blocks:
             x, attention_mask = block(x, attention_mask)
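
Note (not part of the patch): the added block moves the 2D-to-additive-4D mask conversion out of GPTEmbedding and into PipelineGPT, taking the batch size from x.shape[0] on non-first pipeline stages, which never receive input_ids. Below is a minimal standalone sketch of that conversion in plain PyTorch; the function name build_additive_attention_mask is hypothetical, and the ColossalAI-specific col_nn.partition_batch step is omitted.

import torch


def build_additive_attention_mask(attention_mask: torch.Tensor,
                                  dtype: torch.dtype = torch.float16) -> torch.Tensor:
    """Turn a [batch_size, seq_len] padding mask (1 = keep, 0 = pad) into an
    additive mask of shape [batch_size, 1, 1, seq_len] that broadcasts over
    [batch_size, num_heads, from_seq_len, to_seq_len] when added to the
    attention scores."""
    batch_size = attention_mask.shape[0]
    mask = attention_mask.view(batch_size, -1)
    mask = mask.unsqueeze(1).unsqueeze(2)   # [batch_size, 1, 1, seq_len]
    mask = mask.to(dtype=dtype)             # fp16 compatibility
    return (1.0 - mask) * -10000.0          # 0.0 where kept, -10000.0 where padded


if __name__ == "__main__":
    # Two sequences of length 4; the second has its last two positions padded.
    pad_mask = torch.tensor([[1, 1, 1, 1],
                             [1, 1, 0, 0]])
    additive = build_additive_attention_mask(pad_mask)
    print(additive.shape)      # torch.Size([2, 1, 1, 4])
    print(additive[1, 0, 0])   # -0. for kept positions, -10000. for padded ones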