Commit a582a46 (parent c94d930)

GPT-2 Fix

5 files changed: +22 -10 lines


optimum/graphcore/fx/transformation_manager.py (+2 -1)

@@ -16,6 +16,7 @@

 import copy
 import functools
+import operator
 from typing import Iterator, List, Tuple, Union

 import torch

@@ -123,6 +124,6 @@ def compose_reversible_transformations(self, optimization_level: int) -> Reversi
             (1, MergeLinears()),
             # (1, FuseBiasInLinear()),
             # Those change the computation, but are actually needed for fp16 stability.
-            (0, ClipValuesSymmetric(1e4, exclude_targets=("view",))),
+            (0, ClipValuesSymmetric(1e4, include_targets=(torch.add, torch.mul, operator.add, operator.mul))),
             (0, ClipValues(1e-4, float("inf"), include_targets=(torch.nn.LayerNorm,))),
         )
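
The switch above changes the symmetric clip from "everything except view" to an explicit allow-list of the fp16-sensitive ops (tensor and Python adds/muls). As a rough illustration of that pattern, here is a minimal torch.fx sketch that clamps the output of allow-listed call_function nodes; the helper name clip_after_targets and the insertion details are assumptions for illustration, not the actual ClipValuesSymmetric implementation.

# Sketch only: clamp the outputs of selected call_function nodes in an FX graph.
import operator

import torch
import torch.fx


def clip_after_targets(gm: torch.fx.GraphModule, clip_value: float, include_targets) -> torch.fx.GraphModule:
    for node in list(gm.graph.nodes):
        if node.op == "call_function" and node.target in include_targets:
            with gm.graph.inserting_after(node):
                clamp = gm.graph.call_function(
                    torch.clamp, args=(node,), kwargs={"min": -clip_value, "max": clip_value}
                )
            # Re-route every consumer of the original node through the clamp,
            # except the clamp node itself (its input must remain the original node).
            for user in list(node.users):
                if user is not clamp:
                    user.replace_input_with(node, clamp)
    gm.recompile()
    return gm


class Toy(torch.nn.Module):
    def forward(self, x, y):
        return torch.add(x, y) * 3.0  # traces to torch.add and operator.mul nodes


gm = clip_after_targets(torch.fx.symbolic_trace(Toy()), 1e4, (torch.add, torch.mul, operator.add, operator.mul))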

optimum/graphcore/fx/transformations.py (+11 -3)

@@ -204,7 +204,9 @@ def __init__(
     ):
         if clip_value < 0:
             raise ValueError(f"The provided clip value must be equal or greater than 0, but here {clip_value}.")
-        return super().__init__(-clip_value, clip_value, exclude_targets=exclude_targets)
+        return super().__init__(
+            -clip_value, clip_value, include_targets=include_targets, exclude_targets=exclude_targets
+        )


 class OutlineAttribute(ReversibleTransformation):

@@ -406,7 +408,9 @@ def sort_nodes_function(node):

         embedding_node = max(embedding_nodes, key=sort_nodes_function)
         if embedding_node.op == "call_function":
-            raise NotImplementedError("VocabEmbeddingToSerializedEmbedding does not support torch.nn.functional.embedding yet.")
+            raise NotImplementedError(
+                "VocabEmbeddingToSerializedEmbedding does not support torch.nn.functional.embedding yet."
+            )

         split = embedding_node.target.rsplit(".", maxsplit=1)
         if len(split) == 1:

@@ -520,7 +524,11 @@ def transform(self, graph_module: "GraphModule") -> "GraphModule":


 class ShareEmbeddingComputation(Transformation):
-    def __init__(self, name_regex: Optional[str] = None, allowed_embedding_classes: Union[Tuple[Type], Type] = (torch.nn.Embedding, SerializedEmbedding)):
+    def __init__(
+        self,
+        name_regex: Optional[str] = None,
+        allowed_embedding_classes: Union[Tuple[Type], Type] = (torch.nn.Embedding, SerializedEmbedding),
+    ):
         self.name_regex = re.compile(name_regex) if name_regex else None
         self.allowed_embedding_classes = allowed_embedding_classes
         if not isinstance(self.allowed_embedding_classes, tuple):
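
The rsplit call visible in the second hunk is the usual way to turn a qualified module name into a parent path plus attribute name so a module can be swapped in place. A small illustration of that pattern (the swap_module helper below is hypothetical, not the library's replacement logic):

import torch


def swap_module(model: torch.nn.Module, qualified_name: str, new_module: torch.nn.Module) -> None:
    split = qualified_name.rsplit(".", maxsplit=1)
    if len(split) == 1:
        # Top-level attribute, e.g. "wte" directly on the model.
        parent, attr = model, split[0]
    else:
        # "transformer.wte" -> parent "transformer", attribute "wte"
        parent, attr = model.get_submodule(split[0]), split[1]
    setattr(parent, attr, new_module)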

optimum/graphcore/fx/utils.py (-1)

@@ -36,7 +36,6 @@

 # TODO: keep this until transformers >= 4.23.2
 class GCProxy(HFProxy):
-
     @property
     def dtype(self):
         return self.__getattr__("dtype")

optimum/graphcore/models/deberta/modeling_deberta.py (+2 -2)

@@ -43,11 +43,11 @@
     DEFAULT_TRANSFORMATION_MANAGER,
     AddPoptorchBlock,
     AddPoptorchBlocksInSeries,
+    LinearToSerializedLinear,
     OutlineAttribute,
     RecomputationCheckpoint,
-    VocabEmbeddingToSerializedEmbedding,
-    LinearToSerializedLinear,
     TieWeights,
+    VocabEmbeddingToSerializedEmbedding,
     symbolic_trace_pipelined_model,
 )
 from ...modeling_utils import OnehotGather, PipelineMixin, get_layer_ipu, register

optimum/graphcore/models/gpt2/modeling_gpt2.py (+7 -3)

@@ -37,6 +37,7 @@
     symbolic_trace_pipelined_model,
 )
 from ...modeling_utils import PipelineMixin, get_layer_ipu, register
+from .optimized_gpt2_attn import OptimizedGPT2Attention


 logger = logging.get_logger(__name__)

@@ -69,7 +70,7 @@ def get_transformations(self):
         layer_ipu = get_layer_ipu(self.ipu_config.layers_per_ipu)
         transformations = [
             AddPoptorchBlock("Token Embedding", 0, "transformer.wte", log_insertions=log_insertions),
-            AddPoptorchBlock("Position Embedding", 1, "transformer.wtp", log_insertions=log_insertions),
+            AddPoptorchBlock("Position Embedding", 0, "transformer.wpe", log_insertions=log_insertions),
             OutlineAttribute("transformer.ln_f", "LayerNorm"),
             AddPoptorchBlocksInSeries("Layer", layer_ipu, r"transformer.h.[0-9]+", log_insertions=log_insertions),
             # Only one of the following AddPoptorchBlock, will actually add a block.
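
The fix above corrects a module path: in the Hugging Face GPT-2 implementation the position embedding is transformer.wpe (there is no transformer.wtp, so the old block annotation could never match a module), and it is now placed on IPU 0 together with the token embedding. A quick sanity check, assuming the transformers GPT-2 model:

from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")
print(type(model.transformer.wte).__name__)   # Embedding (token embeddings)
print(type(model.transformer.wpe).__name__)   # Embedding (position embeddings)
print(hasattr(model.transformer, "wtp"))      # False
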
@@ -84,7 +85,7 @@ def get_transformations(self):
             )
         )
         if self.ipu_config.embedding_serialization_factor > 1:
-            transformations.append(VocabEmbeddingToSerializedEmbedding())
+            transformations.append(VocabEmbeddingToSerializedEmbedding("transformer.wte"))

         return transformations

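
Above, the serialized-embedding transformation is now pointed explicitly at transformer.wte instead of relying on auto-detection. For readers unfamiliar with the idea, the sketch below shows conceptually what serializing a vocabulary embedding means (split the lookup over the vocab axis and sum masked partial results); it is a simplified stand-in, not the SerializedEmbedding used by optimum-graphcore.

import torch


class NaiveSerializedEmbedding(torch.nn.Module):
    """Toy version: splits an embedding into `factor` row slices."""

    def __init__(self, embedding: torch.nn.Embedding, factor: int):
        super().__init__()
        assert embedding.num_embeddings % factor == 0, "vocab size must divide evenly in this sketch"
        self.chunk = embedding.num_embeddings // factor
        self.slices = torch.nn.ModuleList(
            [torch.nn.Embedding(self.chunk, embedding.embedding_dim) for _ in range(factor)]
        )
        with torch.no_grad():
            for i, s in enumerate(self.slices):
                s.weight.copy_(embedding.weight[i * self.chunk : (i + 1) * self.chunk])

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        out = 0
        for i, s in enumerate(self.slices):
            in_range = (input_ids >= i * self.chunk) & (input_ids < (i + 1) * self.chunk)
            local_ids = torch.where(in_range, input_ids - i * self.chunk, torch.zeros_like(input_ids))
            # Lookups outside this slice contribute zero; only one slice is "hot" per token.
            out = out + s(local_ids) * in_range.unsqueeze(-1)
        return out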

@@ -96,6 +97,9 @@ def parallelize(self):
         - Adds recomputation checkpoints
         """
         PipelineMixin.parallelize(self)
+        if not isinstance(self, torch.fx.GraphModule):
+            for layer in self.transformer.h:
+                layer.attn.__class__ = OptimizedGPT2Attention
         if self.ipu_config.embedding_serialization_factor > 1:
             self.resize_vocab(False)
         traced = symbolic_trace_pipelined_model(self)
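
The __class__ reassignment added above swaps each attention block's behaviour without touching its parameters: the instance keeps its weights and only method resolution changes, which is why it is done on the eager model before tracing (and skipped when self is already a GraphModule). A hedged sketch of the pattern with a placeholder subclass (MyOptimizedGPT2Attention is illustrative, not the OptimizedGPT2Attention shipped with optimum-graphcore):

from transformers.models.gpt2.modeling_gpt2 import GPT2Attention


class MyOptimizedGPT2Attention(GPT2Attention):
    # Same parameters and buffers as GPT2Attention; only the code path changes.
    def forward(self, hidden_states, **kwargs):
        # An IPU-friendly attention implementation would go here.
        return super().forward(hidden_states, **kwargs)


def swap_attention(model):
    for layer in model.transformer.h:
        layer.attn.__class__ = MyOptimizedGPT2Attention  # instance keeps its existing weights
    return model
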
@@ -137,7 +141,7 @@ def get_transformations(self):
         layer_ipu = get_layer_ipu(self.ipu_config.layers_per_ipu)
         transformations = [
             AddPoptorchBlock("Token Embedding", 0, "transformer.wte", log_insertions=log_insertions),
-            AddPoptorchBlock("Position Embedding", 1, "transformer.wtp", log_insertions=log_insertions),
+            AddPoptorchBlock("Position Embedding", 0, "transformer.wpe", log_insertions=log_insertions),
             OutlineAttribute("transformer.ln_f", "LayerNorm"),
             AddPoptorchBlocksInSeries("Layer", layer_ipu, r"transformer.h.[0-9]+", log_insertions=log_insertions),
             AddPoptorchBlock("LM Head", 0, "lm_head", log_insertions=log_insertions),
