Data utils

TranslationDataset

Bases: Dataset

Translation Dataset.

Parameters:

src_texts (list[str]): List of source texts. Required.
tgt_texts (list[str]): List of target texts. Required.
src_tokenizer (BaseTokenizer): Tokenizer used to preprocess the source language text. Required.
tgt_tokenizer (BaseTokenizer): Tokenizer used to preprocess the target language text. Required.
src_lang (str): Identifier for the source language, e.g., "en" for English. Required.
tgt_lang (str): Identifier for the target language, e.g., "it" for Italian. Required.
max_sequence_length (int | None): Maximum sequence length for tokenization. If None, sequences are not truncated. Defaults to None.
Source code in src/tfs_mt/data_utils.py
class TranslationDataset(Dataset):
    """Translation Dataset.

    Args:
        src_texts (list[str]): List of source texts.
        tgt_texts (list[str]): List of target texts.
        src_tokenizer (BaseTokenizer): Tokenizer used to preprocess the source language text.
        tgt_tokenizer (BaseTokenizer): Tokenizer used to preprocess the target language text.
        src_lang (str): Identifier for the source language, e.g., `"en"` for English.
        tgt_lang (str): Identifier for the target language, e.g., `"it"` for Italian.
        max_sequence_length (int | None, optional): Maximum sequence length for tokenization. If None, sequences are not truncated. Defaults to None.
    """

    def __init__(
        self,
        src_texts: list[str],
        tgt_texts: list[str],
        src_tokenizer: BaseTokenizer,
        tgt_tokenizer: BaseTokenizer,
        src_lang: str,
        tgt_lang: str,
        max_sequence_length: int | None = None,
        **kwargs,
    ):
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.max_sequence_length = max_sequence_length

        # Build tokenizers vocab if empty
        if self.src_tokenizer.vocab_size == 0 or self.tgt_tokenizer.vocab_size == 0:
            if "extend_vocab_with_glove" in kwargs and "glove_version" in kwargs:
                self._build_vocabs(
                    vocab_min_freq=kwargs.get("vocab_min_freq", 2),
                    extend_with_glove=kwargs.get("extend_vocab_with_glove", True),
                    glove_version=kwargs.get("glove_version", "glove.2024.wikigiga.50d"),
                )
            else:
                self._build_vocabs(kwargs.get("vocab_min_freq", 2))

        # Filter input data excluding texts longer than max_sequence_length
        if max_sequence_length is not None and max_sequence_length > 0:
            print(f"Max sequence length set to {max_sequence_length}.")
            self.src_texts, self.tgt_texts = [], []
        else:
            self.src_texts = src_texts
            self.tgt_texts = tgt_texts

        # Prepare tokenized texts here to have them ready in __getitem__ method
        # This will speed up batch preparation in the dataloader, but it increases system memory usage since the tokenized sequences are cached as dataset attributes
        # (The masks are also cached but their impact is negligible since they are arrays of boolean values)
        print("Caching encoded sequences in memory, this may take some time...")
        self.src_encoded_sequences, self.tgt_encoded_sequences = [], []
        self.src_masks, self.tgt_masks = [], []
        for src_text, tgt_text in zip(src_texts, tgt_texts, strict=False):
            src_text_tokenized = src_tokenizer.tokenize(src_text)
            tgt_text_tokenized = tgt_tokenizer.tokenize(tgt_text)

            # Exclude sequences that exceed max_sequence_length
            if max_sequence_length is not None and max_sequence_length > 0:
                if (
                    len(src_text_tokenized) > max_sequence_length - 2
                ):  # -2 accounts for SOS and EOS tokens that will be added in the __getitem__ method
                    continue
                if len(tgt_text_tokenized) > max_sequence_length - 2:
                    continue
                self.src_texts.append(src_text)
                self.tgt_texts.append(tgt_text)

            # src and tgt sequence lengths must be the same to properly compute cross attention
            # The smaller sequence will be padded to the length of the longer sequence
            # The attention masks ensure no attention is computed on pad tokens
            max_seq_len = max(len(src_text_tokenized), len(tgt_text_tokenized))

            # Tokenize texts
            src_tokens, src_mask = self.src_tokenizer.encode(src_text_tokenized, pad_to_len=max_seq_len)
            tgt_tokens, tgt_mask = self.tgt_tokenizer.encode(tgt_text_tokenized, pad_to_len=max_seq_len)

            self.src_encoded_sequences.append(src_tokens)
            self.tgt_encoded_sequences.append(tgt_tokens)
            self.src_masks.append(src_mask)
            self.tgt_masks.append(tgt_mask)

    def _build_vocabs(self, vocab_min_freq: int = 2, extend_with_glove: bool = False, **kwargs) -> None:
        """Build vocabularies for tokenizers."""

        print("Building vocabs, it may take a few minutes...")

        # Provides lists of tokens. Here the lists are not converted to sets cause the tokenizer may need the token frequencies
        src_tokens = [token for text in self.src_texts for token in self.src_tokenizer.tokenize(text)]
        tgt_tokens = [token for text in self.tgt_texts for token in self.tgt_tokenizer.tokenize(text)]

        self.src_tokenizer.build_vocab(
            src_tokens,
            min_freq=vocab_min_freq,
            extend_with_glove=bool(
                extend_with_glove and self.src_lang == "en"
            ),  # GloVe is trained on english only datasets so it doesn't make sense to extend non english vocabs
            glove_version=kwargs.get("glove_version", "glove.2024.wikigiga.50d"),
        )
        self.tgt_tokenizer.build_vocab(
            tgt_tokens,
            min_freq=vocab_min_freq,
            extend_with_glove=bool(extend_with_glove and self.tgt_lang == "en"),
            glove_version=kwargs.get("glove_version", "glove.2024.wikigiga.50d"),
        )

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor | str]:
        src_tokens = self.src_encoded_sequences[idx]
        tgt_tokens = self.tgt_encoded_sequences[idx]
        src_mask = self.src_masks[idx]
        tgt_mask = self.tgt_masks[idx]
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        return {
            "src": torch.tensor(src_tokens, dtype=torch.long),
            "tgt": torch.tensor(tgt_tokens, dtype=torch.long),
            "src_mask": torch.tensor(src_mask, dtype=torch.bool),
            "tgt_mask": torch.tensor(tgt_mask, dtype=torch.bool),
            "src_text": src_text,
            "tgt_text": tgt_text,
        }
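
A minimal usage sketch (the parallel texts below are illustrative; real data is loaded in build_data_utils). It assumes the package is importable as tfs_mt and uses two fresh WordTokenizer instances, whose vocabularies are built on the fly because they start empty:

from tfs_mt.data_utils import TranslationDataset, WordTokenizer

# Toy parallel corpus (hypothetical examples)
src_texts = ["the cat sleeps", "we will go home"]
tgt_texts = ["il gatto dorme", "andremo a casa"]

src_tokenizer = WordTokenizer()
tgt_tokenizer = WordTokenizer()

dataset = TranslationDataset(
    src_texts,
    tgt_texts,
    src_tokenizer,
    tgt_tokenizer,
    src_lang="en",
    tgt_lang="it",
    max_sequence_length=32,
    vocab_min_freq=1,  # keep every token of this tiny corpus in the vocabularies
)

sample = dataset[0]
print(sample["src"].dtype, sample["src_mask"].dtype)  # torch.int64 torch.bool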

WordTokenizer

Bases: BaseTokenizer

Word tokenizer. Mainly used to let the model be compatible with pretrained GloVe embeddings.

Parameters:

special_tokens (dict[str, str] | None): Special tokens to be considered, e.g., SOS_TOKEN, EOS_TOKEN. Defaults to None.
contractions (list[str] | None): Contractions to be considered, e.g., 's, 'll. If None, the following set is used: 's, 're, 've, 'm, 'll, 'd, n't, 't, n'ts. Defaults to None.
tokenizer_max_len (int): Tokenizer max sequence length. Mainly used to limit memory usage and the performance impact of attention's quadratic complexity during training and inference. Defaults to 128.
max_vocab_size (int): Maximum number of tokens in the vocabulary. Defaults to 100_000.
Source code in src/tfs_mt/data_utils.py
class WordTokenizer(BaseTokenizer):
    """Word tokenizer.
    Mainly used to let the model be compatible with pretrained GloVe embeddings.

    Args:
        special_tokens (dict[str, str] | None, optional): Special tokens to be considered, eg. SOS_TOKEN, EOS_TOKEN. Defaults to None.
        contractions (list[str] | None, optional): Contractions to be considered, e.g., 's, 'll. If None, the following set of contractions will be considered: `'s`, `'re`, `'ve`, `'m`, `'ll`, `'d`, `n't`, `'t`, `n'ts`. Defaults to None.
        tokenizer_max_len (int, optional): Tokenizer max sequence length. Mainly used to limit memory usage and performance impact during training and inference due to attention quadratic complexity. Defaults to 128.
        max_vocab_size (int, optional): Maximum number of tokens in the vocabulary. Defaults to 100_000.
    """

    def __init__(
        self,
        special_tokens: dict[str, str] | None = None,
        contractions: list[str] | None = None,
        tokenizer_max_len: int = 128,
        max_vocab_size: int = 100_000,
    ):
        self.vocab: dict[str, int] = {}
        self.vocab_reverse: dict[int, str] = {}  # Useful for efficient decoding

        self.tokenizer_max_len = tokenizer_max_len
        self.max_vocab_size = max_vocab_size

        self.special_tokens = special_tokens or {
            "sos_token": "<s>",
            "eos_token": "</s>",
            "pad_token": "<PAD>",
            "unk_token": "<UNK>",
        }

        # Setup contractions management utilities
        contractions_init = contractions or ["'s", "'re", "'ve", "'m", "'ll", "'d", "n't", "'t", "n'ts"]

        # Prepare a dict of the following type to address the detection of nested contractions:
        # nested_contractions_dict = {
        #     "ANYTHING_n't_ANYTHING": ["n't"],
        #     "n't": ["'t"]
        # }
        # NOTE "'t" is technically a sub contraction of both "ANYTHING_n't_ANYTHING" and "n't" but it will be only considered as sub contraction of n't
        self.nested_contractions_dict: dict[str, list[str]] = {}

        for contraction in contractions_init:
            other_contractions = contractions_init.copy()
            other_contractions.remove(contraction)
            for comp_contraction in other_contractions:
                if contraction in comp_contraction:  # Detect sub contraction
                    # Create new mapping in dict
                    if comp_contraction not in self.nested_contractions_dict:
                        self.nested_contractions_dict[comp_contraction] = [contraction]
                    # Detect sub contraction already present in the dict
                    elif comp_contraction in self.nested_contractions_dict and contraction not in list(
                        chain.from_iterable(self.nested_contractions_dict.values())
                    ):
                        # Merge all elements in sub contractions lists in a single list
                        self.nested_contractions_dict[comp_contraction].append(contraction)

        # Final sets of sub contractions and non-nested contractions
        # NOTE the plain list of all sub contractions will be used during tokenization in order to better detect correct splitting when the `'` is encountered
        self.all_sub_contractions = list(chain.from_iterable(self.nested_contractions_dict.values()))
        self.contractions = [c for c in contractions_init if c not in self.all_sub_contractions]

        self.glove_available_versions = [
            "glove.2024.dolma.300d",
            "glove.2024.wikigiga.300d",
            "glove.2024.wikigiga.200d",
            "glove.2024.wikigiga.100d",
            "glove.2024.wikigiga.50d",
            "glove.42B.300d",
            "glove.6B",
            "glove.840B.300d",
            "glove.twitter.27B",
        ]

    @property
    def vocab_size(self):
        return len(self.vocab.items()) if self.vocab else 0

    @property
    def sos_token_idx(self):
        return self.vocab.get(self.special_tokens["sos_token"], 0)

    @property
    def eos_token_idx(self):
        return self.vocab.get(self.special_tokens["eos_token"], 1)

    @property
    def pad_token_idx(self):
        return self.vocab.get(self.special_tokens["pad_token"], 2)

    @property
    def unk_token_idx(self):
        return self.vocab.get(self.special_tokens["unk_token"], 3)

    @classmethod
    def from_pretrained(cls: type["WordTokenizer"], path: str) -> "WordTokenizer":
        if path.startswith(("https://", "http://")):
            try:
                response = requests.get(path, timeout=100)
                response.raise_for_status()
                data = response.json()
            except Exception as e:
                raise RuntimeError(f"Failed to download tokenizer from {path}: {e}") from e
        else:
            if not os.path.exists(path):
                raise FileNotFoundError(f"Tokenizer file not found: {path}")
            with open(path, encoding="utf-8") as f:
                data = json.load(f)

        if data.get("vocab") is None:
            raise CheckpointEmptyVocabException()

        tokenizer = cls(special_tokens=data.get("special_tokens", {}), contractions=data.get("contractions", {}))
        tokenizer.vocab = data.get("vocab")
        tokenizer.vocab_reverse = {idx: token for token, idx in tokenizer.vocab.items()}

        return tokenizer

    def to_json(self, output_path: str) -> None:
        if self.vocab_size == 0:
            raise VocabNotBuiltError()

        to_save = {
            "type": "WordTokenizer",
            "special_tokens": self.special_tokens,
            "contractions": self.contractions,
            "vocab": self.vocab,
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(to_save, f, ensure_ascii=False, indent=2)

    def build_vocab(
        self,
        tokens: list[str],
        min_freq: int = 2,
        extend_with_glove: bool = False,
        glove_version: str = "glove.2024.wikigiga.50d",
        **kwargs,
    ) -> None:
        """Build vocabulary method.

        Args:
            tokens (list[str]): Tokens from dataset to build vocabulary on.
            min_freq (int, optional): Minimum number of times a token has to appear in the dataset to be included in the vocabulary. Defaults to 2.
            extend_with_glove (bool, optional): Enable vocabulary extension with GloVe tokens. Defaults to False.
            glove_version (str, optional): GloVe version to use if `extend_with_glove` is `True`. Defaults to "glove.2024.wikigiga.50d".

        Raises:
            GloVeVersionError: Raised when supplied glove_version is unavailable.
        """
        vocab_list = []
        vocab_list.extend(self.special_tokens.values())
        vocab_set = set(vocab_list)

        if min_freq > 1:
            token_counts = Counter(tokens)
            for token, count in token_counts.items():
                # len(vocab_set) is an O(1) operation because the set stores its length internally
                if (count >= min_freq and self.max_vocab_size == -1) or (
                    count >= min_freq and self.max_vocab_size > 0 and len(vocab_set) < self.max_vocab_size
                ):
                    vocab_set.add(token.lower())
        else:
            for token in tokens:
                if self.max_vocab_size == -1 or (self.max_vocab_size > 0 and len(vocab_set) < self.max_vocab_size):
                    vocab_set.add(token.lower())

        vocab_list = list(vocab_set)
        del vocab_set

        if extend_with_glove and (
            self.max_vocab_size == -1 or (self.max_vocab_size > 0 and len(vocab_list) < self.max_vocab_size)
        ):
            print("Extending vocab with GloVe tokens...")

            if glove_version not in self.glove_available_versions:
                raise GloVeVersionError(glove_version, self.glove_available_versions)

            data_path = os.getcwd() + "/data" if "data_path" not in kwargs else kwargs["data_path"]

            glove_tokens = []

            try:
                glove_filepath = download_glove(data_path, glove_version)

                print(f"Loading GloVe {glove_version} tokens from file...")

                with open(glove_filepath, encoding="utf-8") as f:
                    lines = f.readlines()

                # Parse GloVe tokens
                for line in lines:
                    parts = line.strip().split()
                    try:
                        float(parts[1])
                    except ValueError:
                        continue
                    else:
                        token = parts[0].lower()
                        glove_tokens.append(token)

                initial_size = len(vocab_list)
                if self.max_vocab_size != -1:
                    for tok in glove_tokens:
                        if tok not in vocab_list:
                            vocab_list.append(tok)
                        if len(vocab_list) == self.max_vocab_size:
                            break
                else:
                    vocab_list.extend(glove_tokens)
                    vocab_list = list(set(vocab_list))

                print(f"Added {len(vocab_list) - initial_size} tokens from GloVe")

            except Exception as e:
                print(f"Error with GloVe processing GloVe: {e}")

        # Create mappings
        self.vocab = {token: idx for idx, token in enumerate(vocab_list)}
        self.vocab_reverse = dict(enumerate(vocab_list))

        del vocab_list

        print(f"Built vocabulary with {len(self.vocab.items())} tokens.")

    def tokenize(self, text: str) -> list[str]:
        """Tokenizer based on GloVe word tokenizer in order to let the model be compatible with GloVe pretrained embeddings.

        Note:
            Max word length is 1000. Contractions are treated as distinct tokens, eg. `n't`, `'s`, `'ll`.

        Note: Reference
            GloVe tokenizer source code available [here](https://github.com/stanfordnlp/GloVe/blob/master/src/common.c#L75)

        Args:
            text (str): text to be tokenized.

        Returns:
            list[str]: List of string tokens from text.
        """

        text = text.strip().lower()

        for contraction in self.contractions:
            pattern = r"([a-zA-Z]+)" + re.escape(contraction) + r"\b"
            text = re.sub(pattern, r"\1 " + contraction, text)

        for complete_contraction, sub_list in self.nested_contractions_dict.items():
            pattern = r"([a-zA-Z]+)" + re.escape(complete_contraction) + r"\b"
            if not re.compile(complete_contraction).search(text):
                for sub_contraction in sub_list:
                    pattern = r"([a-zA-Z]+)" + re.escape(sub_contraction) + r"\b"
                    text = re.sub(pattern, r"\1 " + sub_contraction, text)
            else:
                text = re.sub(pattern, r"\1 " + complete_contraction, text)

        words = text.split()

        tokens = []
        for word in words:
            # Split words when encountering a `'` that's not involved in a quote or a contraction.
            # eg. "Quell'ultimo" gets splitted into "Quell'" and "ultimo"
            if (
                "'" in word
                and word[0] != "'"
                and word[-1] != "'"
                and word not in self.contractions
                and word not in self.all_sub_contractions
            ):
                parts = word.split("'")
                for i, part in enumerate(parts):
                    if part:
                        if i < len(parts) - 1:
                            part += "'"
                        tokens.append(part)

            # Handle trailing punctuation
            elif word and word[-1] in string.punctuation and word[-1] != "'":
                # Strip all trailing punctuation
                core_word = word.rstrip(string.punctuation)
                punct = word[len(core_word) :]
                if core_word:
                    tokens.append(core_word)
                for p in punct:
                    tokens.append(p)

            else:
                if word:
                    tokens.append(word)

        # Truncate sequences by default
        return tokens[: self.tokenizer_max_len]

    def encode(
        self,
        input_sequence: str | list[str] | np.ndarray | torch.Tensor,
        pad_to_len: int | None = None,
        return_only_mask: bool = False,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Tokenizer encode function.

        It also returns the mask to be used during attention in order not to compute it with respect to `PAD` tokens.

        Note:
            The mask is True where there is a token the model has to attend to, and False otherwise.

        Args:
            input_sequence (str | list[str] | np.ndarray | torch.Tensor): Sequence to be encoded or already encoded (useful in decoding stage when this method is only used to provide the mask).
            pad_to_len (int | None, optional): Sequence length to pad `input_sequence` to. Defaults to None.
            return_only_mask (bool, optional): Return only attention mask. Defaults to False.

        Raises:
            VocabNotBuiltError: Raised when vocabulary is not built.

        Returns:
            tuple[np.ndarray, np.ndarray]: Array of token ids and mask.
        """

        if self.vocab_size == 0:
            raise VocabNotBuiltError()

        if return_only_mask:
            if isinstance(input_sequence, np.ndarray):
                mask = input_sequence != self.pad_token_idx
            elif isinstance(input_sequence, torch.Tensor):
                mask = (input_sequence != self.pad_token_idx).to(input_sequence.device)
            else:
                raise ValueError()
            return mask

        # Useful when building a TranslationDataset in order to execute tokenize method only once
        tokens = self.tokenize(input_sequence) if isinstance(input_sequence, str) else input_sequence

        # Add SOS and EOS tokens to given sequence
        if tokens[0] != self.special_tokens["sos_token"]:
            tokens.insert(0, self.special_tokens["sos_token"])
        if tokens[-1] != self.special_tokens["eos_token"]:
            tokens.append(self.special_tokens["eos_token"])

        token_ids = [self.vocab.get(token, self.unk_token_idx) for token in tokens]

        if pad_to_len is not None:  # Pad sequence to pad_to_len
            pad_to_len += 2  # Considering SOS and EOS tokens
            token_ids.extend([self.pad_token_idx for _ in range(pad_to_len - len(tokens))])

        token_ids = np.array(token_ids, dtype=np.long)

        # Mask to disable attention to pad tokens
        mask = np.array([token != self.pad_token_idx for token in token_ids], dtype=np.bool)

        return token_ids, mask

    def decode(self, token_ids: np.ndarray | list[str]) -> list[str]:
        """Decode token IDs.
        Returns the unknown token if the input token is not present in the vocabulary.

        Args:
            token_ids (np.ndarray | list[str]): Array or list of token ids to decode into text.

        Raises:
            VocabNotBuiltError: Vocabulary is not built.

        Returns:
            list[str]: Decoded text.
        """
        if self.vocab_size == 0:
            raise VocabNotBuiltError()
        return [self.vocab_reverse.get(idx, self.special_tokens["unk_token"]) for idx in token_ids]
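
A short sketch of the persistence round trip and the special-token index properties (the corpus and file path are hypothetical):

from tfs_mt.data_utils import WordTokenizer

tok = WordTokenizer(tokenizer_max_len=64)
tok.build_vocab(tok.tokenize("it's a small world, isn't it?"), min_freq=1)

print(tok.vocab_size, tok.pad_token_idx, tok.unk_token_idx)

# Persist the vocabulary, special tokens and contractions to JSON, then restore them
tok.to_json("word_tokenizer_en.json")
restored = WordTokenizer.from_pretrained("word_tokenizer_en.json")
assert restored.vocab == tok.vocab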

build_vocab(tokens, min_freq=2, extend_with_glove=False, glove_version='glove.2024.wikigiga.50d', **kwargs)

Build vocabulary method.

Parameters:

tokens (list[str]): Tokens from dataset to build vocabulary on. Required.
min_freq (int): Minimum number of times a token has to appear in the dataset to be included in the vocabulary. Defaults to 2.
extend_with_glove (bool): Enable vocabulary extension with GloVe tokens. Defaults to False.
glove_version (str): GloVe version to use if extend_with_glove is True. Defaults to "glove.2024.wikigiga.50d".

Raises:

GloVeVersionError: Raised when the supplied glove_version is unavailable.

Source code in src/tfs_mt/data_utils.py
def build_vocab(
    self,
    tokens: list[str],
    min_freq: int = 2,
    extend_with_glove: bool = False,
    glove_version: str = "glove.2024.wikigiga.50d",
    **kwargs,
) -> None:
    """Build vocabulary method.

    Args:
        tokens (list[str]): Tokens from dataset to build vocabulary on.
        min_freq (int, optional): Minimum number of times a token has to appear in the dataset to be included in the vocabulary. Defaults to 2.
        extend_with_glove (bool, optional): Enable vocabulary extension with GloVe tokens. Defaults to False.
        glove_version (str, optional): GloVe version to use if `extend_with_glove` is `True`. Defaults to "glove.2024.wikigiga.50d".

    Raises:
        GloVeVersionError: Raised when supplied glove_version is unavailable.
    """
    vocab_list = []
    vocab_list.extend(self.special_tokens.values())
    vocab_set = set(vocab_list)

    if min_freq > 1:
        token_counts = Counter(tokens)
        for token, count in token_counts.items():
            # len(vocab_set) is an O(1) operation because the set stores its length internally
            if (count >= min_freq and self.max_vocab_size == -1) or (
                count >= min_freq and self.max_vocab_size > 0 and len(vocab_set) < self.max_vocab_size
            ):
                vocab_set.add(token.lower())
    else:
        for token in tokens:
            if self.max_vocab_size == -1 or (self.max_vocab_size > 0 and len(vocab_set) < self.max_vocab_size):
                vocab_set.add(token.lower())

    vocab_list = list(vocab_set)
    del vocab_set

    if extend_with_glove and (
        self.max_vocab_size == -1 or (self.max_vocab_size > 0 and len(vocab_list) < self.max_vocab_size)
    ):
        print("Extending vocab with GloVe tokens...")

        if glove_version not in self.glove_available_versions:
            raise GloVeVersionError(glove_version, self.glove_available_versions)

        data_path = os.getcwd() + "/data" if "data_path" not in kwargs else kwargs["data_path"]

        glove_tokens = []

        try:
            glove_filepath = download_glove(data_path, glove_version)

            print(f"Loading GloVe {glove_version} tokens from file...")

            with open(glove_filepath, encoding="utf-8") as f:
                lines = f.readlines()

            # Parse GloVe tokens
            for line in lines:
                parts = line.strip().split()
                try:
                    float(parts[1])
                except ValueError:
                    continue
                else:
                    token = parts[0].lower()
                    glove_tokens.append(token)

            initial_size = len(vocab_list)
            if self.max_vocab_size != -1:
                for tok in glove_tokens:
                    if tok not in vocab_list:
                        vocab_list.append(tok)
                    if len(vocab_list) == self.max_vocab_size:
                        break
            else:
                vocab_list.extend(glove_tokens)
                vocab_list = list(set(vocab_list))

            print(f"Added {len(vocab_list) - initial_size} tokens from GloVe")

        except Exception as e:
            print(f"Error with GloVe processing GloVe: {e}")

    # Create mappings
    self.vocab = {token: idx for idx, token in enumerate(vocab_list)}
    self.vocab_reverse = dict(enumerate(vocab_list))

    del vocab_list

    print(f"Built vocabulary with {len(self.vocab.items())} tokens.")

decode(token_ids)

Decode token IDs. Returns the unknown token if the input token is not present in the vocabulary.

Parameters:

token_ids (np.ndarray | list[str]): Array or list of token ids to decode into text. Required.

Raises:

VocabNotBuiltError: Vocabulary is not built.

Returns:

list[str]: Decoded text.

Source code in src/tfs_mt/data_utils.py
def decode(self, token_ids: np.ndarray | list[str]) -> list[str]:
    """Decode token IDs.
    Returns the unknown token if the input token is not present in the vocabulary.

    Args:
        token_ids (np.ndarray | list[str]): Array or list of token ids to decode into text.

    Raises:
        VocabNotBuiltError: Vocabulary is not built.

    Returns:
        list[str]: Decoded text.
    """
    if self.vocab_size == 0:
        raise VocabNotBuiltError()
    return [self.vocab_reverse.get(idx, self.special_tokens["unk_token"]) for idx in token_ids]
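
Continuing the toy vocabulary from the build_vocab sketch above, ids that are not in the vocabulary decode to the UNK token:

ids, _ = tok.encode("hello world")
print(tok.decode(ids))  # ['<s>', 'hello', '<UNK>', '</s>']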

encode(input_sequence, pad_to_len=None, return_only_mask=False)

Tokenizer encode function.

It also returns the mask to be used during attention in order not to compute it with respect to PAD tokens.

Note

The mask is True where there is a token the model has to attend to, and False otherwise.

Parameters:

input_sequence (str | list[str] | np.ndarray | torch.Tensor): Sequence to be encoded, or an already encoded sequence (useful in the decoding stage when this method is only used to provide the mask). Required.
pad_to_len (int | None): Sequence length to pad input_sequence to. Defaults to None.
return_only_mask (bool): Return only the attention mask. Defaults to False.

Raises:

VocabNotBuiltError: Raised when vocabulary is not built.

Returns:

tuple[np.ndarray, np.ndarray]: Array of token ids and mask.

Source code in src/tfs_mt/data_utils.py
def encode(
    self,
    input_sequence: str | list[str] | np.ndarray | torch.Tensor,
    pad_to_len: int | None = None,
    return_only_mask: bool = False,
) -> tuple[np.ndarray, np.ndarray]:
    """Tokenizer encode function.

    It also returns the mask to be used during attention in order not to compute it with respect to `PAD` tokens.

    Note:
        The mask is True where there is a token the model has to attend to, and False otherwise.

    Args:
        input_sequence (str | list[str] | np.ndarray | torch.Tensor): Sequence to be encoded or already encoded (useful in decoding stage when this method is only used to provide the mask).
        pad_to_len (int | None, optional): Sequence length to pad `input_sequence` to. Defaults to None.
        return_only_mask (bool, optional): Return only attention mask. Defaults to False.

    Raises:
        VocabNotBuiltError: Raised when vocabulary is not built.

    Returns:
        tuple[np.ndarray, np.ndarray]: Array of token ids and mask.
    """

    if self.vocab_size == 0:
        raise VocabNotBuiltError()

    if return_only_mask:
        if isinstance(input_sequence, np.ndarray):
            mask = input_sequence != self.pad_token_idx
        elif isinstance(input_sequence, torch.Tensor):
            mask = (input_sequence != self.pad_token_idx).to(input_sequence.device)
        else:
            raise ValueError()
        return mask

    # Useful when building a TranslationDataset in order to execute tokenize method only once
    tokens = self.tokenize(input_sequence) if isinstance(input_sequence, str) else input_sequence

    # Add SOS and EOS tokens to given sequence
    if tokens[0] != self.special_tokens["sos_token"]:
        tokens.insert(0, self.special_tokens["sos_token"])
    if tokens[-1] != self.special_tokens["eos_token"]:
        tokens.append(self.special_tokens["eos_token"])

    token_ids = [self.vocab.get(token, self.unk_token_idx) for token in tokens]

    if pad_to_len is not None:  # Pad sequence to pad_to_len
        pad_to_len += 2  # Considering SOS and EOS tokens
        token_ids.extend([self.pad_token_idx for _ in range(pad_to_len - len(tokens))])

    token_ids = np.array(token_ids, dtype=np.long)

    # Mask to disable attention to pad tokens
    mask = np.array([token != self.pad_token_idx for token in token_ids], dtype=np.bool)

    return token_ids, mask
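
A sketch of the padding and masking behaviour, continuing the toy tokenizer built above (the exact ids depend on the vocabulary that was built):

ids, mask = tok.encode("hello world", pad_to_len=6)
print(ids.shape)  # (8,): SOS + 2 tokens + EOS, then padded up to 6 + 2
print(mask)       # True for real tokens, False on the PAD positions

# When a sequence of ids is already available, only the mask can be recovered:
mask_only = tok.encode(ids, return_only_mask=True)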

tokenize(text)

Tokenizer based on GloVe word tokenizer in order to let the model be compatible with GloVe pretrained embeddings.

Note

Max word length is 1000. Contractions are treated as distinct tokens, e.g., n't, 's, 'll.

Reference

GloVe tokenizer source code is available at https://github.com/stanfordnlp/GloVe/blob/master/src/common.c#L75

Parameters:

text (str): Text to be tokenized. Required.

Returns:

list[str]: List of string tokens from text.

Source code in src/tfs_mt/data_utils.py
def tokenize(self, text: str) -> list[str]:
    """Tokenizer based on GloVe word tokenizer in order to let the model be compatible with GloVe pretrained embeddings.

    Note:
        Max word length is 1000. Contractions are treated as distinct tokens, eg. `n't`, `'s`, `'ll`.

    Note: Reference
        GloVe tokenizer source code available [here](https://github.com/stanfordnlp/GloVe/blob/master/src/common.c#L75)

    Args:
        text (str): text to be tokenized.

    Returns:
        list[str]: List of string tokens from text.
    """

    text = text.strip().lower()

    for contraction in self.contractions:
        pattern = r"([a-zA-Z]+)" + re.escape(contraction) + r"\b"
        text = re.sub(pattern, r"\1 " + contraction, text)

    for complete_contraction, sub_list in self.nested_contractions_dict.items():
        pattern = r"([a-zA-Z]+)" + re.escape(complete_contraction) + r"\b"
        if not re.compile(complete_contraction).search(text):
            for sub_contraction in sub_list:
                pattern = r"([a-zA-Z]+)" + re.escape(sub_contraction) + r"\b"
                text = re.sub(pattern, r"\1 " + sub_contraction, text)
        else:
            text = re.sub(pattern, r"\1 " + complete_contraction, text)

    words = text.split()

    tokens = []
    for word in words:
        # Split words when encountering a `'` that's not involved in a quote or a contraction.
        # eg. "Quell'ultimo" gets splitted into "Quell'" and "ultimo"
        if (
            "'" in word
            and word[0] != "'"
            and word[-1] != "'"
            and word not in self.contractions
            and word not in self.all_sub_contractions
        ):
            parts = word.split("'")
            for i, part in enumerate(parts):
                if part:
                    if i < len(parts) - 1:
                        part += "'"
                    tokens.append(part)

        # Handle trailing punctuation
        elif word and word[-1] in string.punctuation and word[-1] != "'":
            # Strip all trailing punctuation
            core_word = word.rstrip(string.punctuation)
            punct = word[len(core_word) :]
            if core_word:
                tokens.append(core_word)
            for p in punct:
                tokens.append(p)

        else:
            if word:
                tokens.append(word)

    # Truncate sequences by default
    return tokens[: self.tokenizer_max_len]
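
A quick illustration of contraction and trailing-punctuation handling with the default contraction set:

from tfs_mt.data_utils import WordTokenizer

tok = WordTokenizer()
print(tok.tokenize("The cat's toy, it's broken!"))
# ['the', 'cat', "'s", 'toy', ',', 'it', "'s", 'broken', '!']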

batch_collate_fn(batch, src_pad_token_id, tgt_pad_token_id, pad_all_to_len=-1)

Used to tell the DataLoader how to properly build a batch.

In order to correctly build a batch, every sequence in it has to have the same length, so shorter sequences are padded to the longest one. This is done for src, tgt, src_mask and tgt_mask.

Note

This function needs two PAD token ids since this Transformer implementation uses 2 distinct tokenizers, each with its own vocabulary. The vocabularies are built independently and in parallel, so the PAD token is not guaranteed to have the same ID in both.

Tip

By padding all sequences in the dataset to the same length, higher GPU utilization can be achieved.

It has to be coupled with torch.compile and with dynamic cudagraph compilation disabled (torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = False).

Parameters:

batch (dict[str, torch.Tensor | list[str]]): Batch of token ids and masks. Required.
src_pad_token_id (int): Pad token id for source sequences. Required.
tgt_pad_token_id (int): Pad token id for target sequences. Required.
pad_all_to_len (int): Sequence length to pad all sequences to. If -1 it gets ignored. Defaults to -1.

Returns:

dict[str, torch.Tensor | list[str]]: Batch with padded sequences.

Source code in src/tfs_mt/data_utils.py
def batch_collate_fn(
    batch: dict[str, torch.Tensor | list[str]],
    src_pad_token_id: int,
    tgt_pad_token_id: int,
    pad_all_to_len: int = -1,
) -> dict[str, torch.Tensor | list[str]]:
    """Used to tell the Dataloader how to properly build a batch.

    In order to correctly build a batch every sequence in it has to have the same length,
    so it pads the small sequences to the longest one. It does it for `src`, `tgt`, `src_mask` and `tgt_mask`.

    Note:
        This function needs two `PAD` token ids since in this Transformer implementation there are 2 distinct tokenizers,
        each with its own vocabulary.
        The vocabularies are built independently and in parallel, so there's no guarantee that the PAD token will have the same ID in both.

    Tip:
        By padding all sequences in the dataset to the same length, higher GPU utilization can be achieved.

        It has to be coupled with `torch.compile` usage and with the dynamic cudagraph compilation disabled (`torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = False`)

    Args:
        batch (dict[str, torch.Tensor  |  list[str]]): Batch of token ids and masks.
        src_pad_token_id (int): Pad token for source sequences.
        tgt_pad_token_id (int): Pad token for target sequences.
        pad_all_to_len (int): Sequence length to pad all sequences. If -1 it gets ignored. Defaults to -1.

    Returns:
        dict[str, torch.Tensor | list[str]]: Batch with padded sequences.
    """

    keys = batch[0].keys()
    result = {}

    for key in keys:
        field_data = [sample[key] for sample in batch]

        if isinstance(field_data[0], torch.Tensor):
            if key == "src":
                pad_token_id = src_pad_token_id
            elif key == "tgt":
                pad_token_id = tgt_pad_token_id
            else:  # mask tensors
                pad_token_id = 0  # will be replaced with False

            if pad_all_to_len != -1:
                num_padding_values = pad_all_to_len - field_data[0].shape[0]
                if num_padding_values >= 0:
                    field_data[0] = F.pad(field_data[0], (0, num_padding_values), value=pad_token_id)

            padded_seq = pad_sequence(field_data, batch_first=True, padding_value=pad_token_id)
            result[key] = padded_seq
        else:  # src_text and tgt_text
            result[key] = field_data

    return result
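
A minimal sketch of binding the collate function to a DataLoader with functools.partial; dataset, src_tokenizer and tgt_tokenizer are assumed to come from the TranslationDataset sketch above:

from functools import partial
from torch.utils.data import DataLoader

from tfs_mt.data_utils import batch_collate_fn

loader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=partial(
        batch_collate_fn,
        src_pad_token_id=src_tokenizer.pad_token_idx,
        tgt_pad_token_id=tgt_tokenizer.pad_token_idx,
    ),
)

batch = next(iter(loader))
print(batch["src"].shape, batch["src_mask"].shape)  # (batch_size, longest_sequence_in_batch)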

build_data_utils(config, return_all=False, **kwargs)

Build tokenizers, datasets and dataloaders for Machine Translation.

Parameters:

config (DictConfig | ListConfig): Configuration object from omegaconf. Required.
return_all (bool): Whether to return dataloaders, datasets and tokenizers. Defaults to False.

Returns:

tuple[DataLoader, DataLoader] | tuple[DataLoader, DataLoader, TranslationDataset, TranslationDataset, BaseTokenizer, BaseTokenizer]: Dataloaders, or dataloaders, datasets and tokenizers.

Source code in src/tfs_mt/data_utils.py
def build_data_utils(
    config: DictConfig | ListConfig, return_all: bool = False, **kwargs
) -> (
    tuple[DataLoader, DataLoader]
    | tuple[DataLoader, DataLoader, TranslationDataset, TranslationDataset, BaseTokenizer, BaseTokenizer]
):
    """Build tokenizers, datasets and dataloaders for Machine Translation.

    Args:
        config (DictConfig | ListConfig): Configuration object from omegaconf.
        return_all (bool, optional): Whether to return dataloaders, datasets and tokenizers. Defaults to False.

    Returns:
        tuple[DataLoader, DataLoader] | tuple[DataLoader, DataLoader, TranslationDataset, TranslationDataset, BaseTokenizer, BaseTokenizer]: Dataloaders or dataloaders, datasets and tokenizers.
    """

    data = load_dataset(config.dataset.dataset_id, config.dataset.dataset_name, cache_dir=config.cache_ds_path)["train"]

    # TODO If resuming tokenizers from pretrained add check to ensure src and tgt languages matched between the provided config and the checkpoint one
    src_lang = config.dataset.src_lang
    tgt_lang = config.dataset.tgt_lang

    # Downsample the dataset. Mainly for computational constraints and for testing.
    if config.dataset.max_len != -1:
        data = data.select(range(config.dataset.max_len))

    print(f"Train test splitting - {config.dataset.train_split}/{1 - float(config.dataset.train_split):.2f}")
    split = data.train_test_split(train_size=config.dataset.train_split, seed=config.seed, shuffle=False)
    train_data = split["train"]
    test_data = split["test"]

    try:
        train_src_texts, train_tgt_texts = [], []
        for text in train_data["translation"]:
            train_src_texts.append(text[src_lang])
            train_tgt_texts.append(text[tgt_lang])
        test_src_texts, test_tgt_texts = [], []
        for text in test_data["translation"]:
            test_src_texts.append(text[src_lang])
            test_tgt_texts.append(text[tgt_lang])
    except KeyError as err:  # Mainly here to ensure consistency when resuming model checkpoint
        raise DatasetLanguagesNotAvailableError(src_lang, tgt_lang) from err

    # Build tokenizers and vocabs. Both src and tgt tokenizers vocabs are built using the training data
    special_tokens = {
        "sos_token": config.tokenizer.sos_token,
        "eos_token": config.tokenizer.eos_token,
        "pad_token": config.tokenizer.pad_token,
        "unk_token": config.tokenizer.unk_token,
    }

    # Get tokenizers from pretrained
    if "src_tokenizer" in kwargs and "tgt_tokenizer" in kwargs:
        print("Getting tokenizers from pretrained...")
        src_tokenizer, tgt_tokenizer = kwargs.get("src_tokenizer"), kwargs.get("tgt_tokenizer")

    else:
        print("Building tokenizers and vocabularies...")
        tok_types_dict = {"word": WordTokenizer, "bpe": BPETokenizer}
        src_tokenizer = tok_types_dict[config.tokenizer.type](
            special_tokens,
            tokenizer_max_len=config.tokenizer.max_seq_len,
            max_vocab_size=config.tokenizer.max_vocab_size,
        )
        tgt_tokenizer = tok_types_dict[config.tokenizer.type](
            special_tokens,
            tokenizer_max_len=config.tokenizer.max_seq_len,
            max_vocab_size=config.tokenizer.max_vocab_size,
        )

        # To build the vocabularies texts are not filtered based on tokenizer.max_sequence_length
        src_tokens = [token for text in train_src_texts for token in src_tokenizer.tokenize(text)]
        tgt_tokens = [token for text in train_tgt_texts for token in tgt_tokenizer.tokenize(text)]

        # BPE Tokenizer
        if config.tokenizer.type == "bpe":
            # TODO BPE should take all available text in order to extract statistics
            # and the tokenize method can be done only if the vocabulary is built
            src_tokenizer.build_vocab(src_tokens)
            tgt_tokenizer.build_vocab(tgt_tokens)
        else:
            src_tokenizer.build_vocab(
                src_tokens,
                min_freq=config.tokenizer.vocab_min_freq,
                extend_with_glove=bool(
                    src_lang == "en" and config.model_configs.pretrained_word_embeddings == "GloVe"
                ),  # GloVe is trained on english only datasets so it doesn't make sense to extend non english vocabs
                glove_version=config.model_configs[config.chosen_model_size].glove_version,
            )
            tgt_tokenizer.build_vocab(
                tgt_tokens,
                min_freq=config.tokenizer.vocab_min_freq,
                extend_with_glove=bool(tgt_lang == "en" and config.model_configs.pretrained_word_embeddings == "GloVe"),
                glove_version=config.model_configs[config.chosen_model_size].glove_version,
            )

    config.tokenizer.src_sos_token_idx = src_tokenizer.sos_token_idx
    config.tokenizer.src_eos_token_idx = src_tokenizer.eos_token_idx
    config.tokenizer.src_pad_token_idx = src_tokenizer.pad_token_idx
    config.tokenizer.src_unk_token_idx = src_tokenizer.unk_token_idx
    config.tokenizer.tgt_sos_token_idx = tgt_tokenizer.sos_token_idx
    config.tokenizer.tgt_eos_token_idx = tgt_tokenizer.eos_token_idx
    config.tokenizer.tgt_pad_token_idx = tgt_tokenizer.pad_token_idx
    config.tokenizer.tgt_unk_token_idx = tgt_tokenizer.unk_token_idx

    print("Building datasets...")
    train_dataset = TranslationDataset(
        train_src_texts,
        train_tgt_texts,
        src_tokenizer,
        tgt_tokenizer,
        src_lang=config.dataset.src_lang,
        tgt_lang=config.dataset.tgt_lang,
        max_sequence_length=config.tokenizer.max_seq_len,
    )
    test_dataset = TranslationDataset(
        test_src_texts,
        test_tgt_texts,
        src_tokenizer,
        tgt_tokenizer,
        src_lang=config.dataset.src_lang,
        tgt_lang=config.dataset.tgt_lang,
        max_sequence_length=config.tokenizer.max_seq_len,
    )

    print(f"Train dataset length: {len(train_dataset)}")
    print(f"Test dataset length: {len(test_dataset)}")

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.train_dataloader.batch_size,
        num_workers=config.train_dataloader.num_workers,
        collate_fn=partial(
            batch_collate_fn,
            src_pad_token_id=config.tokenizer.src_pad_token_idx,
            tgt_pad_token_id=config.tokenizer.tgt_pad_token_idx,
            pad_all_to_len=config.tokenizer.max_seq_len if config.train_dataloader.pad_all_to_max_len else -1,
        ),
        shuffle=config.train_dataloader.shuffle,
        drop_last=config.train_dataloader.drop_last,
        persistent_workers=True,  # If True, the dataloader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive
        pin_memory=torch.cuda.is_available(),  # Pinned memory is not used anyway by torch where no accelerator is employed.
        prefetch_factor=config.train_dataloader.prefetch_factor,
    )
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=config.test_dataloader.batch_size,
        num_workers=config.test_dataloader.num_workers,
        collate_fn=partial(
            batch_collate_fn,
            src_pad_token_id=config.tokenizer.src_pad_token_idx,
            tgt_pad_token_id=config.tokenizer.tgt_pad_token_idx,
            pad_all_to_len=config.tokenizer.max_seq_len if config.test_dataloader.pad_all_to_max_len else -1,
        ),
        shuffle=config.test_dataloader.shuffle,
        drop_last=config.test_dataloader.drop_last,
        persistent_workers=True,
        pin_memory=torch.cuda.is_available(),
        prefetch_factor=config.test_dataloader.prefetch_factor,
    )

    print(f"Train dataloader length: {len(train_dataloader)}")
    print(f"Test dataloader length: {len(test_dataloader)}")

    if return_all:
        return train_dataloader, test_dataloader, train_dataset, test_dataset, src_tokenizer, tgt_tokenizer
    return train_dataloader, test_dataloader
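
A usage sketch, assuming an omegaconf YAML file that defines the dataset, tokenizer and dataloader sections read above (the config path is hypothetical):

from omegaconf import OmegaConf

from tfs_mt.data_utils import build_data_utils

config = OmegaConf.load("configs/config.yaml")  # hypothetical config path
train_dl, test_dl, train_ds, test_ds, src_tok, tgt_tok = build_data_utils(config, return_all=True)

for batch in train_dl:
    print(batch["src"].shape, batch["tgt"].shape)
    break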

download_glove(output_dir, glove_version='glove.2024.wikigiga.50d')

Downloads GloVe embeddings and returns the filepath.

Source code in src/tfs_mt/data_utils.py
def download_glove(output_dir: str, glove_version: str = "glove.2024.wikigiga.50d") -> str:
    """Download GloVe embeddings and returns the filepath."""
    glove_folder_path = output_dir + f"/{glove_version}"
    os.makedirs(glove_folder_path, exist_ok=True)

    url = f"https://nlp.stanford.edu/data/wordvecs/{glove_version}.zip"
    zip_path = output_dir + f"/{glove_version}.zip"

    glove_filepath = None
    for file in os.listdir(glove_folder_path):
        if file.endswith(".txt"):
            glove_filepath = os.path.join(glove_folder_path, file)
            break

    if glove_filepath is None:
        print(f"GloVe not found in {glove_folder_path}. Downloading GloVe ({glove_version})...")

        response = requests.get(url, stream=True, timeout=600)
        response.raise_for_status()

        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(glove_folder_path)
        os.remove(zip_path)

        for file in os.listdir(glove_folder_path):
            if file.endswith(".txt"):
                glove_filepath = os.path.join(glove_folder_path, file)
                break

    return glove_filepath
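
For example (the archive is downloaded once; later calls with the same arguments just return the path to the already extracted .txt file):

import os

from tfs_mt.data_utils import download_glove

glove_filepath = download_glove(os.getcwd() + "/data", glove_version="glove.2024.wikigiga.50d")
print(glove_filepath)  # path to the extracted GloVe .txt file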