SequenceTokenizer
The base tokenizer from which other tokenizers are expected to inherit.
The SequenceTokenizerSettings in particular holds the common special tokens
and their associated ids, for example unk_token for unknown tokens.
SequenceTokenizer
pydantic-model
The sequence tokenizer object.
special_token_ids: List[int]
property
readonly
Return the list of token ids corresponding to special tokens.
special_tokens: List[str]
property
readonly
Return the list of special tokens that are not None.
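Both properties can be inspected on any concrete tokenizer. A minimal sketch using KmerTokenizer (the exact tokens and ids returned depend on the tokenizer's settings and defaults):

```python
from gcgc import KmerTokenizer

tokenizer = KmerTokenizer()

# Special tokens that are configured (i.e. not None) on the settings.
print(tokenizer.special_tokens)

# The vocabulary ids those special tokens map to.
print(tokenizer.special_token_ids)
```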
apply_length_constraints(self, tokens)
Apply the constraints from the settings to the passed tokens list.
Source code in gcgc/tokenizer/base.py
```python
def apply_length_constraints(self, tokens: List[str]) -> List[str]:
    """Apply the constraints from the settings to the passed tokens list."""
    if self.conform_length:
        tokens = _pad_token_list(
            tokens, self.conform_length, str(self.pad_token), self.pad_at_end,
        )
        return tokens[: self.conform_length]

    if self.min_length:
        tokens = _pad_token_list(tokens, self.min_length, str(self.pad_token), self.pad_at_end)

    if self.max_length:
        tokens = tokens[: self.max_length]

    return tokens
```
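A minimal usage sketch of the constraint modes. The keyword arguments mirror the settings fields referenced above; the "|" pad token and right-side padding are assumptions about the defaults:

```python
from gcgc import KmerTokenizer

# conform_length pads (or truncates) to an exact length.
conform = KmerTokenizer(conform_length=5, pad_token="|")
print(conform.apply_length_constraints(["AT", "TG", "GC"]))
# -> ["AT", "TG", "GC", "|", "|"] if padding is applied at the end

# min_length pads up to a floor; max_length truncates down to a ceiling.
bounded = KmerTokenizer(min_length=2, max_length=4, pad_token="|")
print(bounded.apply_length_constraints(["AT", "TG", "GC", "CA", "AA"]))
# -> ["AT", "TG", "GC", "CA"]
```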
bare_tokenizer(**values)
classmethod
Init a tokenizer like normal, but default all tokens to None.
Source code in gcgc/tokenizer/base.py
```python
@classmethod
def bare_tokenizer(cls, **values):
    """Init a tokenizer like normal, but default all tokens to None."""
    pad_token = values.pop("pad_token", None)
    bos_token = values.pop("bos_token", None)
    eos_token = values.pop("eos_token", None)
    mask_token = values.pop("mask_token", None)
    unk_token = values.pop("unk_token", None)

    return cls(
        pad_token=pad_token,
        bos_token=bos_token,
        eos_token=eos_token,
        mask_token=mask_token,
        unk_token=unk_token,
        **values,
    )
```
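As a usage sketch, calling it on a concrete subclass yields a tokenizer with no special tokens reserved (KmerTokenizer stands in for any subclass here):

```python
from gcgc import KmerTokenizer

bare = KmerTokenizer.bare_tokenizer()

# Every *_token field was defaulted to None, so nothing is reported as
# special and no special ids are reserved in the vocabulary.
print(bare.special_tokens)     # []
print(bare.special_token_ids)  # []
```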
get_special_tokens_mask(self, token_ids)
Given the input set of tokens, return a list that demarcates special tokens.

```python
>>> from gcgc import KmerTokenizer
>>> tokenizer = KmerTokenizer()
>>> # Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
>>> tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
[1, 0, 0, 1, 1, 1, 1]
```
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| token_ids | List[int] | The list of integer tokens that may or may not be special tokens according to the tokenizer's settings. | required |
Returns:

| Type | Description |
|---|---|
| List[int] | A list of 0s and 1s, where the value is one if the token is a special token. |
Source code in gcgc/tokenizer/base.py
```python
def get_special_tokens_mask(self, token_ids: List[int]) -> List[int]:
    """Given the input set of tokens, return a list that demarcates special tokens.

    >>> from gcgc import KmerTokenizer
    >>> tokenizer = KmerTokenizer()
    >>> # Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
    >>> tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
    [1, 0, 0, 1, 1, 1, 1]

    Args:
        token_ids: The list of integer tokens that may or may not be special tokens according
            to the tokenizer's settings.

    Returns:
        A list of 0s and 1s, where the value is one if the token is a special token.

    """
    return [int(token_id in self.special_token_ids) for token_id in token_ids]
```
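The mask is handy for ignoring special positions downstream. A minimal sketch that inverts it into an attention-style mask, reusing the id assumptions from the docstring above:

```python
from gcgc import KmerTokenizer

tokenizer = KmerTokenizer()

# Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
token_ids = [1, 34, 21, 0, 0, 0, 2]
special_mask = tokenizer.get_special_tokens_mask(token_ids)  # [1, 0, 0, 1, 1, 1, 1]

# Keep only the "real" sequence positions.
attention_mask = [1 - flag for flag in special_mask]  # [0, 1, 1, 0, 0, 0, 0]
```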
token_ids(values)
classmethod
Update the special token ids if appropriate.
Source code in gcgc/tokenizer/base.py
```python
@root_validator
def token_ids(cls, values):  # pylint: disable=no-self-use,no-self-argument
    """Update the special token ids if appropriate."""
    vocab = values["vocab"]

    for key, value in values.items():
        if key.endswith("_token"):
            token_id_field = f"{key}_id"
            token_id = values[token_id_field]

            if token_id is None and value is not None:
                vocab.add_item(value)
                values[token_id_field] = vocab[value]
            elif token_id is not None and value is not None:
                # pylint: disable=fixme
                # TODO: check that token_id isn't already set (maybe it's in the inverse?)
                vocab[value] = token_id

    return values
```
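In effect, supplying a special token without an explicit id lets the validator add it to the vocab and fill the id in. A sketch of that guarantee; the "<pad>" value is an arbitrary choice, and the lookup mirrors the vocab[value] access in the validator:

```python
from gcgc import KmerTokenizer

tokenizer = KmerTokenizer(pad_token="<pad>")

# The validator registered "<pad>" in the vocab and filled in pad_token_id.
assert tokenizer.pad_token_id == tokenizer.vocab["<pad>"]
```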
validate_lengths(values)
classmethod
Check the length arguments are valid.
Source code in gcgc/tokenizer/base.py
```python
@root_validator
def validate_lengths(cls, values):  # pylint: disable=no-self-argument, no-self-use
    """Check the length arguments are valid."""
    max_length = values.get("max_length")
    min_length = values.get("min_length")
    conform_length = values.get("conform_length")
    pad_token = values.get("pad_token")

    if conform_length is not None and (max_length is not None or min_length is not None):
        raise ValueError("If conform length is not None, max and min length can't be set.")

    if max_length is not None and min_length is not None and min_length > max_length:
        raise ValueError(
            f"Min length {min_length} cannot be more than max length {max_length}."
        )

    if (conform_length is not None or min_length is not None) and pad_token is None:
        raise ValueError("Cannot set conform_length or min_length without a pad_token.")

    return values
```
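A sketch of these rules in practice. Because the checks run in a pydantic root validator, a violation surfaces as a pydantic.ValidationError wrapping the ValueError:

```python
import pydantic

from gcgc import KmerTokenizer

try:
    # conform_length is exclusive with max_length and min_length.
    KmerTokenizer(conform_length=10, max_length=20, pad_token="|")
except pydantic.ValidationError as err:
    print(err)
```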