
Commit f44d1d2

Zenglinxiao authored and francoishernandez committed
More documentation for transformer decoder, default alignment_heads (#1692)
1 parent 9a4f7a4 commit f44d1d2

3 files changed

Lines changed: 92 additions & 33 deletions


docs/source/refs.bib

Lines changed: 20 additions & 0 deletions
@@ -445,3 +445,23 @@ @inproceedings{garg2019jointly
     url = {https://arxiv.org/abs/1909.02074},
     year = {2019},
 }
+
+@inproceedings{DeeperTransformer,
+  title = "Learning Deep Transformer Models for Machine Translation",
+  author = "Wang, Qiang and
+    Li, Bei and
+    Xiao, Tong and
+    Zhu, Jingbo and
+    Li, Changliang and
+    Wong, Derek F. and
+    Chao, Lidia S.",
+  booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
+  month = jul,
+  year = "2019",
+  address = "Florence, Italy",
+  publisher = "Association for Computational Linguistics",
+  url = "https://www.aclweb.org/anthology/P19-1176",
+  doi = "10.18653/v1/P19-1176",
+  pages = "1810--1822",
+  abstract = "Transformer is the state-of-the-art model in recent machine translation evaluations. Two strands of research are promising to improve models of this kind: the first uses wide networks (a.k.a. Transformer-Big) and has been the de facto standard for development of the Transformer system, and the other uses deeper language representation but faces the difficulty arising from learning deep networks. Here, we continue the line of research on the latter. We claim that a truly deep Transformer model can surpass the Transformer-Big counterpart by 1) proper use of layer normalization and 2) a novel way of passing the combination of previous layers to the next. On WMT{'}16 English-German and NIST OpenMT{'}12 Chinese-English tasks, our deep system (30/25-layer encoder) outperforms the shallow Transformer-Big/Base baseline (6-layer encoder) by 0.4-2.4 BLEU points. As another bonus, the deep model is 1.6X smaller in size and 3X faster in training than Transformer-Big.",
+}

onmt/decoders/transformer.py

Lines changed: 71 additions & 32 deletions
@@ -12,21 +12,46 @@
 
 
 class TransformerDecoderLayer(nn.Module):
-    """
+    """Transformer Decoder layer block in Pre-Norm style.
+    Pre-Norm style is an improvement w.r.t. the original paper's Post-Norm
+    style, providing better convergence speed and performance. This is also
+    the actual implementation in tensor2tensor and is also available in
+    fairseq. See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.
+
+    .. mermaid::
+
+        graph LR
+        %% "*SubLayer" can be self-attn, src-attn or feed forward block
+            A(input) --> B[Norm]
+            B --> C["*SubLayer"]
+            C --> D[Drop]
+            D --> E((+))
+            A --> E
+            E --> F(out)
+
+
     Args:
-        d_model (int): the dimension of keys/values/queries in
-            :class:`MultiHeadedAttention`, also the input size of
-            the first-layer of the :class:`PositionwiseFeedForward`.
-        heads (int): the number of heads for MultiHeadedAttention.
-        d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
-        dropout (float): dropout probability.
-        self_attn_type (string): type of self-attention scaled-dot, average
+        d_model (int): the dimension of keys/values/queries in
+            :class:`MultiHeadedAttention`, also the input size of
+            the first-layer of the :class:`PositionwiseFeedForward`.
+        heads (int): the number of heads for MultiHeadedAttention.
+        d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
+        dropout (float): dropout in residual, self-attn(dot) and feed-forward
+        attention_dropout (float): dropout in context_attn (and self-attn(avg))
+        self_attn_type (string): type of self-attention scaled-dot, average
+        max_relative_positions (int):
+            Max distance between inputs in relative positions representations
+        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
+        full_context_alignment (bool):
+            whether to enable an extra full context decoder forward for alignment
+        alignment_heads (int):
+            N. of cross attention heads to use for alignment guiding
     """
 
     def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                  self_attn_type="scaled-dot", max_relative_positions=0,
                  aan_useffn=False, full_context_alignment=False,
-                 alignment_heads=None):
+                 alignment_heads=0):
         super(TransformerDecoderLayer, self).__init__()
 
         if self_attn_type == "scaled-dot":
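For readers skimming the new docstring, the following is a minimal, self-contained sketch of the Pre-Norm residual pattern that the mermaid graph describes (Norm -> *SubLayer -> Drop -> residual add). The class name PreNormResidual and its arguments are illustrative only and not part of OpenNMT-py; in Post-Norm, by contrast, the LayerNorm is applied after the residual addition.

import torch.nn as nn

class PreNormResidual(nn.Module):
    """Pre-Norm wrapping: out = x + Drop(SubLayer(Norm(x))).

    `sublayer` stands for any block in the graph above
    (self-attn, src-attn or the feed-forward block).
    """
    def __init__(self, d_model, sublayer, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model, eps=1e-6)
        self.sublayer = sublayer
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # normalize first (Pre-Norm), run the sub-layer, apply dropout,
        # then add the untouched input back as the residual connection
        return x + self.drop(self.sublayer(self.norm(x)))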
@@ -48,10 +73,10 @@ def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
         self.alignment_heads = alignment_heads
 
     def forward(self, *args, **kwargs):
-        """ Extend _forward for (possibly) multiple decoder pass:
-        1. Always a default (future masked) decoder forward pass,
-        2. Possibly a second future aware decoder pass for joint learn
-           full context alignement.
+        """ Extend `_forward` for (possibly) multiple decoder passes:
+        Always a default (future masked) decoder forward pass;
+        possibly a second future-aware decoder pass for jointly learning
+           full context alignment, :cite:`garg2019jointly`.
 
         Args:
             * All arguments of _forward.
@@ -60,9 +85,9 @@ def forward(self, *args, **kwargs):
         Returns:
             (FloatTensor, FloatTensor, FloatTensor or None):
 
-            * output ``(batch_size, 1, model_dim)``
-            * top_attn ``(batch_size, 1, src_len)``
-            * attn_align ``(batch_size, 1, src_len)`` or None
+            * output ``(batch_size, T, model_dim)``
+            * top_attn ``(batch_size, T, src_len)``
+            * attn_align ``(batch_size, T, src_len)`` or None
         """
         with_align = kwargs.pop('with_align', False)
         output, attns = self._forward(*args, **kwargs)
@@ -73,7 +98,7 @@ def forward(self, *args, **kwargs):
             # return _, (B, Q_len, K_len)
             _, attns = self._forward(*args, **kwargs, future=True)
 
-            if self.alignment_heads is not None:
+            if self.alignment_heads > 0:
                 attns = attns[:, :self.alignment_heads, :, :].contiguous()
             # layer average attention across heads, get ``(B, Q, K)``
             # Case 1: no full_context, no align heads -> layer avg baseline
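As a rough illustration of what the new default means: with alignment_heads=0 the guided-alignment attention is the average over all cross-attention heads of the chosen layer, while alignment_heads=N > 0 first keeps only the first N heads, as in the branch above. A hedged sketch (function name hypothetical, not the OpenNMT-py API):

import torch

def heads_to_alignment(attns, alignment_heads=0):
    # attns: (batch, heads, tgt_len, src_len) cross-attention weights from
    # the (possibly future-aware) decoder pass
    if alignment_heads > 0:
        # keep only the first N heads for the alignment supervision
        attns = attns[:, :alignment_heads, :, :].contiguous()
    # average across the remaining heads -> (batch, tgt_len, src_len)
    return attns.mean(dim=1)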
@@ -85,18 +110,23 @@ def forward(self, *args, **kwargs):
     def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                  layer_cache=None, step=None, future=False):
         """ A naive forward pass for transformer decoder.
-        # TODO: change 1 to T as T could be 1 or tgt_len
+
+        # T: could be 1 in the case of stepwise decoding or tgt_len
+
         Args:
-            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
+            inputs (FloatTensor): ``(batch_size, T, model_dim)``
             memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
             src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
-            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``
+            tgt_pad_mask (LongTensor): ``(batch_size, 1, T)``
+            layer_cache (dict or None): cached layer info for stepwise decoding
+            step (int or None): stepwise decoding counter
+            future (bool): If set True, do not apply future_mask.
 
         Returns:
             (FloatTensor, FloatTensor):
 
-            * output ``(batch_size, 1, model_dim)``
-            * attns ``(batch_size, head, 1, src_len)``
+            * output ``(batch_size, T, model_dim)``
+            * attns ``(batch_size, head, T, src_len)``
 
         """
         dec_mask = None
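The future flag above controls whether the usual subsequent-position mask is stacked on top of the target padding mask; in the extra alignment pass it is skipped so the decoder attends over the full target context. A minimal sketch of that masking logic (helper name hypothetical; the library's own code may differ in detail):

import torch

def build_dec_mask(tgt_pad_mask, future=False):
    # tgt_pad_mask: bool (batch_size, 1, T), True at padding positions
    tgt_len = tgt_pad_mask.size(-1)
    if future:
        # future-aware pass: only padding is masked
        return tgt_pad_mask.expand(-1, tgt_len, -1)
    # default pass: also block attention to positions after the query
    future_mask = torch.triu(
        torch.ones(tgt_len, tgt_len, device=tgt_pad_mask.device),
        diagonal=1).bool()
    return tgt_pad_mask | future_mask.unsqueeze(0)  # (batch_size, T, T)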
@@ -166,22 +196,31 @@ class TransformerDecoder(DecoderBase):
 
 
     Args:
-        num_layers (int): number of encoder layers.
-        d_model (int): size of the model
-        heads (int): number of heads
-        d_ff (int): size of the inner FF layer
-        copy_attn (bool): if using a separate copy attention
-        self_attn_type (str): type of self-attention scaled-dot, average
-        dropout (float): dropout parameters
-        embeddings (onmt.modules.Embeddings):
-            embeddings to use, should have positional encodings
+        num_layers (int): number of decoder layers.
+        d_model (int): size of the model
+        heads (int): number of heads
+        d_ff (int): size of the inner FF layer
+        copy_attn (bool): if using a separate copy attention
+        self_attn_type (str): type of self-attention scaled-dot, average
+        dropout (float): dropout in residual, self-attn(dot) and feed-forward
+        attention_dropout (float): dropout in context_attn (and self-attn(avg))
+        embeddings (onmt.modules.Embeddings):
+            embeddings to use, should have positional encodings
+        max_relative_positions (int):
+            Max distance between inputs in relative positions representations
+        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
+        full_context_alignment (bool):
+            whether to enable an extra full context decoder forward for alignment
+        alignment_layer (int): N° of layer to supervise for alignment guiding
+        alignment_heads (int):
+            N. of cross attention heads to use for alignment guiding
     """
 
     def __init__(self, num_layers, d_model, heads, d_ff,
                  copy_attn, self_attn_type, dropout, attention_dropout,
                  embeddings, max_relative_positions, aan_useffn,
                  full_context_alignment, alignment_layer,
-                 alignment_heads=None):
+                 alignment_heads):
         super(TransformerDecoder, self).__init__()
 
         self.embeddings = embeddings
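To tie the two docstrings together, here is a hedged usage sketch of TransformerDecoderLayer with the documented arguments; the hyper-parameter values are illustrative, not recommended settings. Note that TransformerDecoder itself now takes alignment_heads as a required argument rather than a keyword default.

from onmt.decoders.transformer import TransformerDecoderLayer

layer = TransformerDecoderLayer(
    d_model=512, heads=8, d_ff=2048,
    dropout=0.1, attention_dropout=0.1,
    self_attn_type="scaled-dot", max_relative_positions=0,
    aan_useffn=False, full_context_alignment=False,
    alignment_heads=0)  # 0 (the new default) averages all cross-attention heads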

onmt/opts.py

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ def model_opts(parser):
               "https://arxiv.org/abs/1909.02074")
     group.add('--alignment_layer', '-alignment_layer', type=int, default=-3,
               help='Layer number which has to be supervised.')
-    group.add('--alignment_heads', '-alignment_heads', type=int, default=None,
+    group.add('--alignment_heads', '-alignment_heads', type=int, default=0,
               help='N. of cross attention heads per layer to supervised with')
     group.add('--full_context_alignment', '-full_context_alignment',
               action="store_true",
