Embeddings

Embedding

Bases: Module

Transformer embeddings layer.

This implementation uses a randomly initialized embedding lookup table with dimension [vocab_size, d_model].

Pretrained GloVe embeddings can optionally be loaded instead.

This choice was made to reach acceptable performance when training with limited resources and limited time.

Note

GloVe embeddings are available for English only.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_size` | `int` | Number of tokens in vocabulary. | *required* |
| `d_model` | `int \| None` | Model dimension. Defaults to `None`. | `None` |
| `from_pretrained` | `bool` | Load embeddings from pretrained. Defaults to `False`. | `False` |
| `pretrained_emb_type` | `str \| None` | Type of pretrained embeddings. Defaults to `None`. | `None` |
| `pretrained_emb_path` | `str \| None` | Path of pretrained embeddings. Defaults to `None`. | `None` |

Raises:

| Type | Description |
| --- | --- |
| `EmbeddingDimError` | Raised when the provided embedding dimension does not match the expected size. |
| `EmbeddingTypePathError` | Raised when the embedding type or file path is invalid or not found. |
| `TokenizerNotSuppliedError` | Raised when a tokenizer is required but has not been supplied. |
| `VocabNotBuiltError` | Raised when an operation requiring a built vocabulary is attempted before the vocabulary is constructed. |
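
A minimal construction sketch (the `tfs_mt.embeddings` import path is inferred from the source path below, and the vocabulary size is a placeholder):

```python
import torch
from tfs_mt.embeddings import Embedding  # import path assumed from src/tfs_mt/embeddings.py

# Randomly initialized lookup table: d_model must be given when from_pretrained=False,
# otherwise EmbeddingDimError is raised.
emb = Embedding(vocab_size=32000, d_model=512)

token_ids = torch.randint(0, 32000, (2, 10))  # [B=2, S=10]
print(emb(token_ids).shape)                   # torch.Size([2, 10, 512])

# Loading GloVe instead (English only) additionally requires a tokenizer with a built vocabulary:
# Embedding(vocab_size, from_pretrained=True, pretrained_emb_type="GloVe",
#           pretrained_emb_path="<glove_dir>/<glove_version>/<glove_file>.txt", tokenizer=tok)
```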

Source code in src/tfs_mt/embeddings.py
class Embedding(nn.Module):
    """Transformer embeddings layer.

    This implementation uses a randomly initialized embedding lookup table with dimension `[vocab_size, d_model]`.

    Note: Pretrained GloVe embeddings can optionally be loaded instead.
        This choice was made to reach acceptable performance when training with limited resources and limited time.

    Note:
        GloVe embeddings are available for English only.

    Args:
        vocab_size (int): Number of tokens in vocabulary.
        d_model (int | None, optional): Model dimension. Defaults to None.
        from_pretrained (bool, optional): Load embeddings from pretrained. Defaults to False.
        pretrained_emb_type (str | None, optional): Type of pretrained embeddings. Defaults to None.
        pretrained_emb_path (str | None, optional): Path of pretrained embeddings. Defaults to None.

    Raises:
        EmbeddingDimError: Raised when the provided embedding dimension does not match the expected size.
        EmbeddingTypePathError: Raised when the embedding type or file path is invalid or not found.
        TokenizerNotSuppliedError: Raised when a tokenizer is required but has not been supplied.
        VocabNotBuiltError: Raised when an operation requiring a built vocabulary is attempted before the vocabulary is constructed.
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int | None = None,
        from_pretrained: bool = False,
        pretrained_emb_type: str | None = None,
        pretrained_emb_path: str | None = None,
        **kwargs,
    ):
        super().__init__()

        if d_model is None and not from_pretrained:
            raise EmbeddingDimError(d_model, from_pretrained)

        if from_pretrained:
            if d_model is not None:
                print(f"Ignoring d_model ({d_model}). The embeddings dim will be inferred from pretrained.")
            if pretrained_emb_type is None or pretrained_emb_path is None:
                raise EmbeddingTypePathError(from_pretrained, pretrained_emb_type, pretrained_emb_path)
            if pretrained_emb_type == "GloVe" and "tokenizer" not in kwargs:
                raise TokenizerNotSuppliedError()
            if kwargs["tokenizer"].vocab_size == 0:
                raise VocabNotBuiltError()

            if pretrained_emb_type == "GloVe":
                embeddings_dim, embeddings_lut = self.load_pretrained(
                    pretrained_emb_path, pretrained_emb_type, tokenizer=kwargs["tokenizer"]
                )
            else:
                embeddings_dim, embeddings_lut = self.load_pretrained(pretrained_emb_path, pretrained_emb_type)

            # The following operations rescale the norm of the pretrained GloVe embeddings
            # so that source and target embeddings have a comparable norm, which stabilizes training.
            embeddings_lut_dummy = nn.Embedding(embeddings_lut.weight.shape[0], embeddings_lut.weight.shape[1])
            nn.init.xavier_uniform_(embeddings_lut_dummy.weight)

            embeddings_lut_dummy_norm = torch.mean(embeddings_lut_dummy.weight.data.norm(dim=1))
            embeddings_lut_norm = torch.mean(embeddings_lut.weight.data.norm(dim=1))

            rescale_coeff = embeddings_lut_dummy_norm / embeddings_lut_norm

            embeddings_lut.weight.data = embeddings_lut.weight.data * rescale_coeff

        else:
            embeddings_dim = d_model
            # Randomly initialized lookup table.
            embeddings_lut = nn.Embedding(vocab_size, embeddings_dim)
            nn.init.xavier_uniform_(embeddings_lut.weight)

        self.d_model = embeddings_dim
        self.scaling_factor = math.sqrt(self.d_model)
        self.embeddings_lut = embeddings_lut

    def load_pretrained(self, embeddings_path: str, emb_type: str = "GloVe", **kwargs) -> tuple[int, nn.Embedding]:
        """Loads pretrained GloVe embeddings into the embedding lookup table."""
        if emb_type == "GloVe":
            tokenizer = kwargs["tokenizer"]

            if not os.path.isfile(embeddings_path):
                output_dir = "/".join(embeddings_path.split("/")[:-2])
                download_glove(output_dir, glove_version=embeddings_path.split("/")[-2])

            with open(embeddings_path, encoding="utf-8") as f:
                embeddings_dim = len(f.readline().strip().split()) - 1
            embeddings_lut = nn.Embedding(tokenizer.vocab_size, embeddings_dim)

            # NOTE The vocab extension with GloVe tokens is handled by the tokenizer.
            # Here GloVe token embeddings are mapped to the corresponding entry in the embeddings lookup table
            with open(embeddings_path, encoding="utf-8") as f:
                for line in f:
                    parts = line.strip().split()
                    idx, _ = tokenizer.encode(parts[0])
                    # If the tokenization is not perfectly compatible with GloVe's, idx can be a sequence longer than 3 tokens
                    # (the first and last tokens returned by tokenizer.encode are SOS_TOKEN and EOS_TOKEN)
                    # This check avoids overwriting entries in the embeddings_lut matrix
                    if len(idx) > 3:
                        continue
                    idx = idx[1]  # The first token coming out of tokenizer.encode is SOS_TOKEN
                    if len(parts[1:]) != embeddings_dim:  # Skip unhandled tokens with spaces, e.g. "1 3/4"
                        continue
                    try:
                        token_emb = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float32)
                    except ValueError:
                        continue
                    else:
                        embeddings_lut.weight.data[idx].copy_(token_emb)
        else:
            raise EmbeddingTypeNotImplementedError(emb_type)

        return embeddings_dim, embeddings_lut

    def forward(self, token_ids: Float[torch.Tensor, "B S"]) -> Float[torch.Tensor, "B S D"]:
        """Get token embeddings.

        Args:
            token_ids (Float[torch.Tensor, "B S"]): Input batch of token_ids. Where B is the batch size and S is the sequence length.

        Returns:
            Float[torch.Tensor, "B S D"]: Output batch of token embeddings. Where D is d_model.
        """

        embeddings = self.embeddings_lut(token_ids)

        # In the embedding layers, we multiply those weights by sqrt(d_model) (Attention is all you need page 5)
        embeddings = embeddings * self.scaling_factor

        return embeddings
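
The rescaling step in `__init__` can be reproduced in isolation: a Xavier-initialized dummy table provides the target mean row norm, and the pretrained table is scaled to match it. A standalone sketch with hypothetical shapes:

```python
import torch
import torch.nn as nn

pretrained = nn.Embedding(30000, 300)          # stand-in for the GloVe-filled table
nn.init.normal_(pretrained.weight, std=0.4)    # pretend pretrained weights with their own norm

reference = nn.Embedding(30000, 300)           # dummy table with the target init
nn.init.xavier_uniform_(reference.weight)

target_norm = reference.weight.data.norm(dim=1).mean()
current_norm = pretrained.weight.data.norm(dim=1).mean()
pretrained.weight.data *= target_norm / current_norm

# After rescaling, the mean row norm matches the Xavier-initialized reference.
print(pretrained.weight.data.norm(dim=1).mean(), target_norm)
```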

forward(token_ids)

Get token embeddings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `token_ids` | `Float[Tensor, 'B S']` | Input batch of token ids, where B is the batch size and S is the sequence length. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `Float[Tensor, 'B S D']` | Output batch of token embeddings, where D is d_model. |
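
The sqrt(d_model) scaling applied in the last line of `forward` can be checked numerically; a small sketch assuming a toy d_model of 4:

```python
import math
import torch
import torch.nn as nn

d_model = 4
lut = nn.Embedding(10, d_model)
token_ids = torch.tensor([[1, 2, 3]])        # [B=1, S=3]

raw = lut(token_ids)                         # plain lookup
scaled = raw * math.sqrt(d_model)            # what Embedding.forward returns
assert torch.allclose(scaled, raw * 2.0)     # sqrt(4) = 2
print(scaled.shape)                          # torch.Size([1, 3, 4])
```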

Source code in src/tfs_mt/embeddings.py
def forward(self, token_ids: Float[torch.Tensor, "B S"]) -> Float[torch.Tensor, "B S D"]:
    """Get token embeddings.

    Args:
        token_ids (Float[torch.Tensor, "B S"]): Input batch of token_ids. Where B is the batch size and S is the sequence length.

    Returns:
        Float[torch.Tensor, "B S D"]: Output batch of token embeddings. Where D is d_model.
    """

    embeddings = self.embeddings_lut(token_ids)

    # In the embedding layers, we multiply those weights by sqrt(d_model) (Attention is all you need page 5)
    embeddings = embeddings * self.scaling_factor

    return embeddings

load_pretrained(embeddings_path, emb_type='GloVe', **kwargs)

Loads pretrained GloVe embeddings into the embedding lookup table.
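
For context, each line of a GloVe text file holds a token followed by its vector components, which is how the method infers the embedding dimension from the first line. A sketch of parsing one line (the file name is hypothetical):

```python
import torch

with open("glove.6B.300d.txt", encoding="utf-8") as f:    # hypothetical file name
    parts = f.readline().strip().split()
    token, values = parts[0], parts[1:]
    embedding_dim = len(values)                            # how load_pretrained infers the dimension
    vector = torch.tensor([float(x) for x in values], dtype=torch.float32)

print(token, embedding_dim, vector.shape)
```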

Source code in src/tfs_mt/embeddings.py
def load_pretrained(self, embeddings_path: str, emb_type: str = "GloVe", **kwargs) -> tuple[int, nn.Embedding]:
    """Loads pretrained GloVe embeddings into the embedding lookup table."""
    if emb_type == "GloVe":
        tokenizer = kwargs["tokenizer"]

        if not os.path.isfile(embeddings_path):
            output_dir = "/".join(embeddings_path.split("/")[:-2])
            download_glove(output_dir, glove_version=embeddings_path.split("/")[-2])

        with open(embeddings_path, encoding="utf-8") as f:
            embeddings_dim = len(f.readline().strip().split()) - 1
        embeddings_lut = nn.Embedding(tokenizer.vocab_size, embeddings_dim)

        # NOTE The vocab extension with GloVe tokens is handled by the tokenizer.
        # Here GloVe token embeddings are mapped to the corresponding entry in the embeddings lookup table
        with open(embeddings_path, encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split()
                idx, _ = tokenizer.encode(parts[0])
                # If the tokenization is not perfectly compatible with GloVe's, idx can be a sequence longer than 3 tokens
                # (the first and last tokens returned by tokenizer.encode are SOS_TOKEN and EOS_TOKEN)
                # This check avoids overwriting entries in the embeddings_lut matrix
                if len(idx) > 3:
                    continue
                idx = idx[1]  # The first token coming out of tokenizer.encode is SOS_TOKEN
                if len(parts[1:]) != embeddings_dim:  # Skip unhandled tokens with spaces, e.g. "1 3/4"
                    continue
                try:
                    token_emb = torch.tensor([float(x) for x in parts[1:]], dtype=torch.float32)
                except ValueError:
                    continue
                else:
                    embeddings_lut.weight.data[idx].copy_(token_emb)
    else:
        raise EmbeddingTypeNotImplementedError(emb_type)

    return embeddings_dim, embeddings_lut

SinusoidalPositionalEncoding

Bases: Module

Sinusoidal Positional Encoding implementation from the [original paper](https://arxiv.org/abs/1706.03762).

\[
\begin{align*}
PE_{(pos,2i)} &= \sin(pos/10000^{2i/d_{model}}) \\
PE_{(pos,2i+1)} &= \cos(pos/10000^{2i/d_{model}})
\end{align*}
\]

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `d_model` | `int` | Model dimension. | *required* |
| `dropout_prob` | `float` | Dropout probability. Defaults to 0.1. | `0.1` |
| `max_sequence_length` | `int` | Max sequence length. Defaults to 128. | `128` |
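
A compact standalone sketch of how the formula above fills the positional-encoding lookup table (same computation as in `__init__`, with toy sizes):

```python
import torch

d_model, max_len = 8, 4
position = torch.arange(max_len).unsqueeze(1)                                        # [max_len, 1]
freq = torch.pow(10000.0, -torch.arange(0, d_model, 2, dtype=torch.float32) / d_model)  # 1 / 10000^(2i/d_model)

pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * freq)   # even indices
pe[:, 1::2] = torch.cos(position * freq)   # odd indices
print(pe[1])                               # PE values for position 1
```
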
Source code in src/tfs_mt/embeddings.py
class SinusoidalPositionalEncoding(nn.Module):
    """Sinusoidal Positional Encoding implementation from the [original paper](https://arxiv.org/abs/1706.03762).

    $$
    \\begin{align*}
    PE_{(pos,2i)} &= \\sin(pos/10000^{2i/d_{model}}) \\\\
    PE_{(pos,2i+1)} &= \\cos(pos/10000^{2i/d_{model}})
    \\end{align*}
    $$

    Args:
        d_model (int): Model dimension.
        dropout_prob (float, optional): Dropout probability. Defaults to 0.1.
        max_sequence_length (int, optional): Max sequence length. Defaults to 128.
    """

    def __init__(self, d_model: int, dropout_prob: float = 0.1, max_sequence_length: int = 128):
        super().__init__()

        # Vector of all possible positions in the sequence [max_sequence_length, 1]
        position_id = torch.arange(0, max_sequence_length).unsqueeze(1)
        i_idx = torch.arange(0, d_model, 2, dtype=torch.float32) / d_model  # 2i / d_model
        freq_vec = torch.pow(10000.0, -i_idx)  # 1 / (10000^(2i/d_model))

        pe_lut = torch.zeros(max_sequence_length, d_model)  # Init positional encoding lookup table
        pe_lut[:, 0::2] = torch.sin(position_id * freq_vec)  # Assign sine on even positions
        pe_lut[:, 1::2] = torch.cos(position_id * freq_vec)  # Assign cosine on odd positions

        # Register these weights as a buffer so that they are saved in the model state_dict,
        # but they won't appear in model.parameters(), so the optimizer will not update them
        self.register_buffer("pe_lut", pe_lut)

        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, token_embeddings: Float[torch.Tensor, "B S D"]) -> Float[torch.Tensor, "B S D"]:
        """Get token embeddings with positional information.

        Args:
            token_embeddings (Float[torch.Tensor, "B S D"]): Input batch of token embeddings.

        Raises:
            IncompatibleEmbeddingsDimError: Raised when the input embedding dimension is invalid.

        Returns:
            Float[torch.Tensor, "B S D"]: Token embeddings with added positional information.
        """
        if token_embeddings.ndim != 3 or token_embeddings.size(-1) != self.pe_lut.shape[1]:
            raise IncompatibleEmbeddingsDimError(token_embeddings.shape)

        positional_encodings = self.pe_lut[: token_embeddings.size(1)]

        # we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks (Attention is all you need page 8)
        final_embedding = self.dropout(token_embeddings + positional_encodings)

        return final_embedding

forward(token_embeddings)

Get token embeddings with positional information.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `token_embeddings` | `Float[Tensor, 'B S D']` | Input batch of token embeddings. | *required* |

Raises:

| Type | Description |
| --- | --- |
| `IncompatibleEmbeddingsDimError` | Raised when the input embedding dimension is invalid. |

Returns:

| Type | Description |
| --- | --- |
| `Float[Tensor, 'B S D']` | Token embeddings with added positional information. |
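
Putting both modules together; a usage sketch (import path assumed from the source path below):

```python
import torch
from tfs_mt.embeddings import Embedding, SinusoidalPositionalEncoding  # import path assumed

emb = Embedding(vocab_size=32000, d_model=512)
pos = SinusoidalPositionalEncoding(d_model=512, dropout_prob=0.1, max_sequence_length=128)

token_ids = torch.randint(0, 32000, (2, 20))   # [B=2, S=20]
x = pos(emb(token_ids))                        # scaled embeddings + positional encodings, then dropout
print(x.shape)                                 # torch.Size([2, 20, 512])
```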

Source code in src/tfs_mt/embeddings.py
def forward(self, token_embeddings: Float[torch.Tensor, "B S D"]) -> Float[torch.Tensor, "B S D"]:
    """Get token embeddings with positional information.

    Args:
        token_embeddings (Float[torch.Tensor, "B S D"]): Input batch of token embeddings.

    Raises:
        IncompatibleEmbeddingsDimError: Raised when the input embedding dimension is invalid.

    Returns:
        Float[torch.Tensor, "B S D"]: Token embeddings with added positional information.
    """
    if token_embeddings.ndim != 3 or token_embeddings.size(-1) != self.pe_lut.shape[1]:
        raise IncompatibleEmbeddingsDimError(token_embeddings.shape)

    positional_encodings = self.pe_lut[: token_embeddings.size(1)]

    # we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks (Attention is all you need page 8)
    final_embedding = self.dropout(token_embeddings + positional_encodings)

    return final_embedding