SequenceTokenizer
The base tokenizer from which other tokenizers are expected to inherit.
In particular, SequenceTokenizerSettings holds the common special tokens and their associated
ids, for example unk_token for unknown tokens.
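As a rough sketch of how these settings surface on a concrete subclass (the KmerTokenizer import mirrors the docstring example further below; the printed values are assumptions and depend on the model's defaults):

```python
from gcgc import KmerTokenizer  # a concrete subclass of SequenceTokenizer

# The special-token fields live on the tokenizer itself; their exact default
# values are an assumption here and may differ between gcgc versions.
tokenizer = KmerTokenizer()
print(tokenizer.unk_token)          # token substituted for unknown symbols
print(tokenizer.special_tokens)     # every special token that is not None
print(tokenizer.special_token_ids)  # the matching ids in the vocabulary
```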
SequenceTokenizer (pydantic-model)
The sequence tokenizer object.
        
special_token_ids: List[int] (property, readonly)
Return the list of token ids corresponding to special tokens.
        
special_tokens: List[str] (property, readonly)
Return the set of special tokens that are not None.
        
apply_length_constraints(self, tokens)
Apply the constraints from the settings to the passed tokens list.
Source code in gcgc/tokenizer/base.py
def apply_length_constraints(self, tokens: List[str]) -> List[str]:
    """Apply the constraints from the settings to the passed tokens list."""
    if self.conform_length:
        tokens = _pad_token_list(
            tokens, self.conform_length, str(self.pad_token), self.pad_at_end,
        )
        return tokens[: self.conform_length]
    if self.min_length:
        tokens = _pad_token_list(tokens, self.min_length, str(self.pad_token), self.pad_at_end)
    if self.max_length:
        tokens = tokens[: self.max_length]
    return tokens
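A minimal usage sketch, assuming conform_length and pad_token can be passed to a concrete subclass at construction time (the token values below are illustrative):

```python
from gcgc import KmerTokenizer

# Assumption: conform_length and pad_token are accepted as keyword arguments.
# conform_length requires a pad_token (see validate_lengths below).
tokenizer = KmerTokenizer(conform_length=5, pad_token="|")

# Short inputs are padded out to conform_length, long inputs are truncated.
print(tokenizer.apply_length_constraints(["ATG", "TGC"]))
# e.g. ['ATG', 'TGC', '|', '|', '|']
print(tokenizer.apply_length_constraints(["ATG"] * 8))
# e.g. ['ATG', 'ATG', 'ATG', 'ATG', 'ATG']
```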
        
bare_tokenizer(**values) (classmethod)
Init a tokenizer like normal, but default all tokens to None.
Source code in gcgc/tokenizer/base.py
@classmethod
def bare_tokenizer(cls, **values):
    """Init a tokenizer like normal, but default all tokens to None."""
    pad_token = values.pop("pad_token", None)
    bos_token = values.pop("bos_token", None)
    eos_token = values.pop("eos_token", None)
    mask_token = values.pop("mask_token", None)
    unk_token = values.pop("unk_token", None)
    return cls(
        pad_token=pad_token,
        bos_token=bos_token,
        eos_token=eos_token,
        mask_token=mask_token,
        unk_token=unk_token,
        **values,
    )
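A sketch of how this differs from the normal constructor; the concrete defaults printed for the regular tokenizer are assumptions:

```python
from gcgc import KmerTokenizer

# bare_tokenizer forces every special token to None, so nothing is registered
# as a special token in the vocabulary.
bare = KmerTokenizer.bare_tokenizer()
print(bare.special_tokens)     # []
print(bare.special_token_ids)  # []

# The regular constructor keeps whatever special-token defaults the model defines.
regular = KmerTokenizer()
print(regular.special_tokens)  # non-empty; the exact values depend on the defaults
```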
        
get_special_tokens_mask(self, token_ids)
Given the input set of tokens, return a list that demarcates special tokens.
>>> from gcgc import KmerTokenizer
>>> tokenizer = KmerTokenizer()
# Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
>>> tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
[1, 0, 0, 1, 1, 1, 1]
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| token_ids | List[int] | The list of integer tokens that may or may not be special tokens according to the tokenizer's settings. | required |

Returns:

| Type | Description |
|---|---|
| List[int] | A list of 0s and 1s, where the value is one if the token is a special token. |
Source code in gcgc/tokenizer/base.py
def get_special_tokens_mask(self, token_ids: List[int]) -> List[int]:
    """Given the input set of tokens, return a list that demarcates special tokens.
        >>> from gcgc import KmerTokenizer
        >>> tokenizer = KmerTokenizer()
        # Assuming 1 is bos_token, 2 is eos_token, and 0 is pad_token.
        >>> tokenizer.get_special_tokens_mask([1, 34, 21, 0, 0, 0, 2])
        [1, 0, 0, 1, 1, 1, 1]
    Args:
        token_ids: The list of integer tokens that may or may not be special tokens according
            to the tokenizer's settings.
    Returns:
        A list of 0s and 1s, where the value is one if the token is a special token.
    """
    return [int(token_id in self.special_token_ids) for token_id in token_ids]
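One common use of the mask is to strip special tokens from an encoded sequence; the ids here are the same illustrative ones as in the docstring example above:

```python
from gcgc import KmerTokenizer

tokenizer = KmerTokenizer()
token_ids = [1, 34, 21, 0, 0, 0, 2]  # illustrative ids, as in the docstring
mask = tokenizer.get_special_tokens_mask(token_ids)

# Keep only the positions whose mask value is 0, i.e. the sequence content.
content_ids = [tid for tid, is_special in zip(token_ids, mask) if not is_special]
print(content_ids)  # e.g. [34, 21], assuming 1, 2, and 0 are special token ids
```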
        
token_ids(values) (classmethod)
Update the pad token id if appropriate.
Source code in gcgc/tokenizer/base.py
@root_validator
def token_ids(cls, values):  # pylint: disable=no-self-use,no-self-argument
    """Update the pad token id if appropriate."""
    vocab = values["vocab"]
    for key, value in values.items():
        if key.endswith("_token"):
            token_id_field = f"{key}_id"
            token_id = values[token_id_field]
            if token_id is None and value is not None:
                vocab.add_item(value)
                values[token_id_field] = vocab[value]
            elif token_id is not None and value is not None:
                # pylint: disable=fixme
                # TODO: check that token_id isn't already set (maybe its in the inverse?)
                vocab[value] = token_id
    return values
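The practical effect, sketched under the assumption that each *_token field has a matching *_token_id field on the model:

```python
from gcgc import KmerTokenizer

# When pad_token is set but pad_token_id is not, the validator adds the pad
# token to the vocabulary and fills in its id during construction.
tokenizer = KmerTokenizer()
print(tokenizer.pad_token, tokenizer.pad_token_id)
# e.g. '|' 0 (the exact values depend on the model's defaults)
```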
        
validate_lengths(values) (classmethod)
Check the length arguments are valid.
Source code in gcgc/tokenizer/base.py
@root_validator
def validate_lengths(cls, values):  # pylint: disable=no-self-argument, no-self-use
    """Check the length arguments are valid."""
    max_length = values.get("max_length")
    min_length = values.get("min_length")
    conform_length = values.get("conform_length")
    pad_token = values.get("pad_token")
    if conform_length is not None and (max_length is not None or min_length is not None):
        raise ValueError("If conform length is not None, max and min length can't be set.")
    if max_length is not None and min_length is not None and min_length > max_length:
        raise ValueError(
            f"Min length {min_length} cannot be more than max length {max_length}."
        )
    if (conform_length is not None or min_length is not None) and pad_token is None:
        raise ValueError("Cannot set conform_length or min_length without a pad_token.")
    return values
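A sketch of the combinations this validator rejects; because it runs as a pydantic root validator, the ValueError surfaces as a pydantic.ValidationError at construction time (passing these fields as keyword arguments is an assumption about the concrete subclass):

```python
import pydantic
from gcgc import KmerTokenizer

# conform_length cannot be combined with max_length or min_length.
try:
    KmerTokenizer(conform_length=10, max_length=20)
except pydantic.ValidationError as err:
    print(err)

# min_length may not exceed max_length.
try:
    KmerTokenizer(min_length=20, max_length=10)
except pydantic.ValidationError as err:
    print(err)
```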