SequenceTokenizer

The base tokenizer from which other tokenizers are expected to inherit.

The SequenceTokenizerSettings object, in particular, holds the common special tokens and their associated ids, for example the unk_token used for unknown tokens.
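
As a hedged sketch (assuming the classes import from gcgc.tokenizer.base, the file the source excerpts below come from), a settings object can be built with an explicit special token and id:

```python
from gcgc.tokenizer.base import SequenceTokenizerSettings

# Configure the unknown-token character and its integer id explicitly.
settings = SequenceTokenizerSettings(unk_token="?", unk_token_id=3)
```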

SequenceTokenizer

The sequence tokenizer object.

__init__(self, settings=None)

Source code in gcgc/tokenizer/base.py:

```python
    def __init__(self, settings: Optional[SequenceTokenizerSettings] = None):
        """Init the sequence tokenizer with a set of settings."""
        self.settings = settings or SequenceTokenizerSettings()
```

Init the sequence tokenizer with a set of settings.
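
A hedged usage sketch (assuming the base class can be constructed directly with default settings; in practice a subclass would usually be used):

```python
from gcgc.tokenizer.base import SequenceTokenizer, SequenceTokenizerSettings

# With no argument, the tokenizer builds a default settings object.
tokenizer = SequenceTokenizer()

# Or pass explicit settings.
tokenizer = SequenceTokenizer(SequenceTokenizerSettings(pad_token="|"))
```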

apply_length_constraints(self, tokens)

Source code in gcgc/tokenizer/base.py:

```python
    def apply_length_constraints(self, tokens: List[str]) -> List[str]:
        """Apply the constraints from the settings to the passed tokens list."""
        if self.settings.conform_length:
            tokens = _pad_token_list(tokens, self.settings.conform_length, self.settings.pad_token)
            return tokens[: self.settings.conform_length]

        if self.settings.min_length:
            tokens = _pad_token_list(tokens, self.settings.min_length, self.settings.pad_token)

        if self.settings.max_length:
            tokens = tokens[: self.settings.max_length]

        return tokens
```

Apply the constraints from the settings to the passed tokens list.
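
A hedged sketch of the behavior (this assumes _pad_token_list pads on the right with the configured pad token and leaves longer lists unchanged; the padding side is not shown in the excerpt above):

```python
from gcgc.tokenizer.base import SequenceTokenizer, SequenceTokenizerSettings

settings = SequenceTokenizerSettings(conform_length=5, pad_token="|")
tokenizer = SequenceTokenizer(settings)

# Shorter inputs are padded up to conform_length...
tokenizer.apply_length_constraints(["A", "T", "G"])
# -> ["A", "T", "G", "|", "|"]  (assuming right padding)

# ...and longer inputs are truncated down to it.
tokenizer.apply_length_constraints(list("ATGCATGC"))
# -> ["A", "T", "G", "C", "A"]
```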

get_special_tokens_mask(self, token_ids)

Source code in gcgc/tokenizer/base.py:

````python
    def get_special_tokens_mask(self, token_ids: List[int]) -> List[int]:
        """Given the input set of tokens, return a list that demarcates special characters.

        ```python
        # Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
        >>> tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
        [1, 0, 0, 1, 1, 1, 1]
        ```

        Args:
            token_ids: The list of integer tokens that may or may not be special tokens according
                to the tokenizer's settings.

        Returns:
            A list of 0s and 1s, where the value is one if the token is a special character.

        """
        return [int(token_id in self.settings.special_token_ids) for token_id in token_ids]
````

Given the input set of tokens, return a list that demarcates special characters.

```python
# Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
>>> tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
[1, 0, 0, 1, 1, 1, 1]
```

Parameters

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| token_ids | List[int] | The list of integer tokens that may or may not be special tokens according to the tokenizer's settings. | required |

Returns

| Type | Description |
| --- | --- |
| List[int] | A list of 0s and 1s, where the value is one if the token is a special character. |
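
A hedged end-to-end sketch of the same call, with the bos/eos/pad ids of 1/2/0 assumed in the docstring example configured explicitly:

```python
from gcgc.tokenizer.base import SequenceTokenizer, SequenceTokenizerSettings

settings = SequenceTokenizerSettings(
    pad_token="|", pad_token_id=0,
    bos_token=">", bos_token_id=1,
    eos_token="<", eos_token_id=2,
)
tokenizer = SequenceTokenizer(settings)

# Ids 0, 1, and 2 are special, so they map to 1 in the mask.
tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
# -> [1, 0, 0, 1, 1, 1, 1]
```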

SequenceTokenizerSettings

The base tokenizer settings class, which inherits from pydantic.BaseSettings.

To see the available fields, use pydantic's `schema_json`.

```python
>>> print(SequenceTokenizerSettings.schema_json(indent=2))
{
  "title": "SequenceTokenizerSettings",
  ...
}
```

special_token_ids: List[int] (property, readonly)

Return the list of token ids corresponding to special tokens.

special_tokens: List[str] (property, readonly)

Return the set of special tokens that are not None.
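
A hedged sketch of the two properties (the ids shown assume the defaults that the validator below assigns when only the token characters are passed; the ordering of the returned lists is an assumption):

```python
from gcgc.tokenizer.base import SequenceTokenizerSettings

settings = SequenceTokenizerSettings(pad_token="|", bos_token=">", eos_token="<")

settings.special_tokens     # the configured, non-None tokens, e.g. ["|", ">", "<"]
settings.special_token_ids  # their ids, e.g. [0, 1, 2] via the validator defaults
```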

validate_and_set_special_tokens(values) (classmethod)

Source code in gcgc/tokenizer/base.py:

```python
    @root_validator
    def validate_and_set_special_tokens(cls, values):
        """Check that the tokens and token id pairs have corresponding values.

        If they do not, this will set the default value.

        """
        special_token_defaults = [
            ("pad_token", "|", 0),
            ("bos_token", ">", 1),
            ("eos_token", "<", 2),
            ("unk_token", "?", 3),
        ]
        for token_name, token_char, token_id in special_token_defaults:
            passed_token = values.get(token_name)
            passed_token_id = values.get(f"{token_name}_id")

            if (passed_token is None and passed_token_id is None) or (
                passed_token is not None and passed_token_id is not None
            ):
                continue

            if passed_token is None and passed_token_id is not None:
                values[token_name] = token_char

            if passed_token is not None and passed_token_id is None:
                values[f"{token_name}_id"] = token_id

        return values
```

Check that the tokens and token id pairs have corresponding values.

If they do not, this will set the default value.
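
A sketch of the default-filling behavior, using the defaults listed in the validator source above:

```python
from gcgc.tokenizer.base import SequenceTokenizerSettings

# Passing only the token string: the validator fills in the default id.
settings = SequenceTokenizerSettings(unk_token="?")
settings.unk_token_id  # -> 3

# Passing only the id: the validator fills in the default character.
settings = SequenceTokenizerSettings(eos_token_id=2)
settings.eos_token  # -> "<"
```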

validate_lengths(values) (classmethod)

Source code in gcgc/tokenizer/base.py:

```python
    @root_validator
    def validate_lengths(cls, values):  # pylint: disable=no-self-argument, no-self-use
        """Check the length arguments are valid."""
        max_length = values.get("max_length")
        min_length = values.get("min_length")
        conform_length = values.get("conform_length")
        pad_token = values.get("pad_token")

        if conform_length is not None and (max_length is not None or min_length is not None):
            raise ValueError(f"If conform length is not None, max and min length can't be set.")

        if max_length is not None and min_length is not None and min_length > max_length:
            raise ValueError(
                f"Min length {min_length} cannot be more than max length {max_length}."
            )

        if (conform_length is not None or min_length is not None) and pad_token is None:
            raise ValueError("Cannot set conform_length or min_length without a pad_token.")

        return values
```

Check the length arguments are valid.
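
A hedged sketch of the validation behavior (since these are root validators, the errors surface as a pydantic ValidationError at construction time):

```python
import pydantic

from gcgc.tokenizer.base import SequenceTokenizerSettings

# conform_length cannot be combined with max_length or min_length.
try:
    SequenceTokenizerSettings(conform_length=10, max_length=20, pad_token="|")
except pydantic.ValidationError as err:
    print(err)

# conform_length and min_length also require a pad_token.
try:
    SequenceTokenizerSettings(min_length=5)
except pydantic.ValidationError as err:
    print(err)
```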