KmerTokenizer
This module holds the kmer tokenizer and its settings.
The KmerTokenizer tokenizes incoming sequences into kmers of configurable length and stride over a configurable alphabet.
For example, given incoming nucleotide sequences, a kmer length of 3 with a stride of 3 encodes each sequence as codons (in the loose sense), with a vocabulary of size 64.
>>> from gcgc import KmerTokenizer
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
>>> tokenizer.encode('AAATTTCCCGGG')
[0, 21, 42, 63]
>>> len(tokenizer.vocab)
64
>>> tokenizer.encode_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[[0, 21, 42, 63], [63, 42, 21, 0]]
>>> tokenizer.tokenize_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[['AAA', 'TTT', 'CCC', 'GGG'], ['GGG', 'CCC', 'TTT', 'AAA']]
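Because each kmer over the four-letter nucleotide alphabet is its own token, the vocabulary grows as 4**kmer_length. A quick check, assuming the default alphabet and no added special tokens:
>>> small = KmerTokenizer.bare_tokenizer(kmer_length=2, kmer_stride=2)
>>> len(small.vocab)
16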
KmerTokenizer (pydantic model)

The Kmer Tokenizer that encodes sequences into chunked kmers.
        
augment_vocab_with_kmer(values) (classmethod)

Update the vocab of the SequenceTokenizer with a kmer alphabet.
Source code in gcgc/tokenizer/kmer_tokenzier.py
@root_validator
def augment_vocab_with_kmer(cls, values):  # pylint: disable=no-self-argument,no-self-use
    """Update the vocab of the SequenceTokenizer with a kmer alphabet."""
    vocab = values["vocab"]
    alphabet = values["alphabet"]
    kmer_length = values["kmer_length"]
    values["vocab"] = _create_kmer_vocab_from_token(vocab, alphabet, kmer_length)
    return values
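The resulting vocab maps each kmer string to an integer id, which is what encode (below) looks up. For instance, with the same settings as the module example:
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
>>> tokenizer.vocab['AAA']
0
>>> tokenizer.vocab['GGG']
63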
        
decode(self, token_ids)

Given the token ids, convert the list into the associated strings.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def decode(self, token_ids: List[int]) -> List[str]:
    """Given the token ids, convert the list into the associated strings."""
    return [self.vocab.itos[i] for i in token_ids]
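Together with encode, decode gives a round trip from ids back to kmer strings; note that it returns the list of tokens rather than re-joining them into the original sequence:
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
>>> tokenizer.decode(tokenizer.encode('AAATTTCCCGGG'))
['AAA', 'TTT', 'CCC', 'GGG']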
        
encode(self, seq, add_unknown=False)        
      
Encode the underlying sequence into a list of token ids.
Parameters:
| Name | Type | Description | Default | 
|---|---|---|---|
| seq | str | The incoming sequence to encode. | required | 
| add_unknown | bool | If True, add the unknown token rather than throwing an out of vocabulary error. | False | 
Returns:
| Type | Description | 
|---|---|
| List[int] | A list of the encoded tokens. | 
Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode(self, seq: str, add_unknown: bool = False) -> List[int]:
    """Encode the underlying sequence into a list of token ids.
    Args:
        seq: The incoming sequence to encode.
        add_unknown: If True, add the unknown token rather than throwing an out of vocabulary
            error.
    Returns:
        A list of the encoded tokens.
    """
    encoded = []
    for letter in self.tokenize(seq):
        try:
            encoded.append(self.vocab[letter])
        except KeyError:
            if add_unknown:
                encoded.append(self.unk_token_id)
            else:
                raise
    return encoded
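The add_unknown flag controls the out-of-vocabulary behavior in the except branch above. As a sketch, assuming the configured alphabet does not include the ambiguity code N:
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
>>> try:
...     tokenizer.encode('AAANNN')
... except KeyError:
...     print('out of vocabulary')
out of vocabulary
With add_unknown=True, the unknown token id is appended in place of the missing kmer instead (provided the tokenizer has an unknown token configured).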
        
encode_batch(self, seqs)        
      
Encode a list of sequences.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode_batch(self, seqs: List[str]) -> List[List[int]]:
    """Encode a list of sequences."""
    return [self.encode(s) for s in seqs]
        
resolve_alphabet(alphabet) (classmethod)

Resolve the alphabet if it's a named alphabet.
Parameters:
| Name | Type | Description | Default | 
|---|---|---|---|
| alphabet | str | The raw alphabet, either the sequence literal or a name of a preset alphabet. | required | 
Returns:
| Type | Description | 
|---|---|
| str | The new alphabet. | 
Source code in gcgc/tokenizer/kmer_tokenzier.py
@validator("alphabet")
def resolve_alphabet(cls, alphabet: str) -> str:
    """Resolve the alphabet if it's a named alphabet.
    Args:
        alphabet: The raw alphabet, either the sequence literal or a name of a preset alphabet.
    Returns:
        The new alphabet.
    """
    return alphabets.resolve_alphabet(alphabet)
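The alphabet field therefore accepts either form. The example below passes a sequence literal and assumes bare_tokenizer forwards the alphabet field; for preset names, consult gcgc.alphabets for the names that actually resolve:
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=1, kmer_stride=1, alphabet='ATCG')
>>> len(tokenizer.vocab)
4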
        
tokenize(self, seq)        
      
Tokenize the sequence into a list of tokens.
Parameters:
| Name | Type | Description | Default | 
|---|---|---|---|
| seq | str | The sequence to encode. | required | 
Returns:
| Type | Description | 
|---|---|
| List[str] | The list of strings that are the tokens. | 
Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize(self, seq: str) -> List[str]:
    """Tokenize the sequence into a list of tokens.
    Args:
        seq: The sequence to encode.
    Returns:
        The list of strings that are the tokens.
    """
    seq_len = len(seq)
    if seq_len < self.kmer_length:
        raise ValueError(
            f"seq length {seq_len} cannot be less than the kmer size {self.kmer_length}"
        )
    if self.kmer_length == 1:
        kmer_list = list(seq)
    else:
        kmer_list = self._kmer_n(seq)
    if self.bos_token:
        kmer_list = [self.bos_token] + kmer_list
    if self.eos_token:
        kmer_list = kmer_list + [self.eos_token]
    return super().apply_length_constraints(kmer_list)
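A stride smaller than the kmer length yields overlapping kmers. Assuming _kmer_n implements a sliding window that advances by the configured stride, kmer length 3 with stride 1 behaves like this:
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=1)
>>> tokenizer.tokenize('AAATTT')
['AAA', 'AAT', 'ATT', 'TTT']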
        
tokenize_batch(self, seqs)        
      
Tokenize a list of sequences.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize_batch(self, seqs: List[str]) -> List[List[str]]:
    """Tokenize a list of sequences."""
    return [self.tokenize(s) for s in seqs]