KmerTokenizer

This module holds the kmer tokenizer and its settings.

The KmerTokenizer tokenizes incoming sequences into kmers of configurable size over a configurable alphabet.

For example, given incoming nucleotide sequences, a kmer length of 3 and a stride of 3 produce encoded sequences of codons (in the loose sense), with a vocabulary of size 64.

>>> from gcgc import KmerTokenizer
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)

>>> tokenizer.encode('AAATTTCCCGGG')
[0, 21, 42, 63]

>>> len(tokenizer.vocab)
64

>>> tokenizer.encode_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[[0, 21, 42, 63], [63, 42, 21, 0]]

>>> tokenizer.tokenize_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[['AAA', 'TTT', 'CCC', 'GGG'], ['GGG', 'CCC', 'TTT', 'AAA']]

KmerTokenizer (pydantic model)

The Kmer Tokenizer that encodes sequences into chunked kmers.

augment_vocab_with_kmer(values) classmethod

Update the vocab of the SequenceTokenizer with a kmer alphabet.

Source code in gcgc/tokenizer/kmer_tokenzier.py
@root_validator
def augment_vocab_with_kmer(cls, values):  # pylint: disable=no-self-argument,no-self-use
    """Update the vocab of the SequenceTokenizer with a kmer alphabet."""
    vocab = values["vocab"]
    alphabet = values["alphabet"]
    kmer_length = values["kmer_length"]

    values["vocab"] = _create_kmer_vocab_from_token(vocab, alphabet, kmer_length)
    return values
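
The helper _create_kmer_vocab_from_token is not shown here, but the token ids in the example above (AAA → 0, TTT → 21, CCC → 42, GGG → 63) are consistent with enumerating every length-k string over an "ATCG" alphabet in cross-product order. A minimal sketch of that enumeration, under that assumption:

>>> from itertools import product
>>> kmers = ["".join(p) for p in product("ATCG", repeat=3)]
>>> len(kmers)
64
>>> kmers.index("TTT"), kmers.index("GGG")
(21, 63)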

decode(self, token_ids)

Given the token ids, convert the list into the associated strings.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def decode(self, token_ids: List[int]) -> List[str]:
    """Given the token ids, convert the the list into the associated strings."""
    return [self.vocab.itos[i] for i in token_ids]
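
Assuming the tokenizer from the example at the top of this page, decode inverts encode back to the kmer strings:

>>> tokenizer.decode([0, 21, 42, 63])
['AAA', 'TTT', 'CCC', 'GGG']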

encode(self, seq, add_unknown=False)

Encode the underlying sequence into a list of token ids.

Parameters:

    seq (str, required)
        The incoming sequence to encode.

    add_unknown (bool, default: False)
        If True, add the unknown token rather than throwing an out of vocabulary error.

Returns:

    List[int]: A list of the encoded tokens.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode(self, seq: str, add_unknown: bool = False) -> List[int]:
    """Encode the underlying sequence into a list of tokens ids.

    Args:
        seq: The incoming sequence to encode.
        add_unknown: If True, add the unknown token rather than throwing an out of vocabulary
            error.

    Returns:
        A list of the encoded tokens.

    """
    encoded = []

    for letter in self.tokenize(seq):
        try:
            encoded.append(self.vocab[letter])
        except KeyError:
            if add_unknown:
                encoded.append(self.unk_token_id)
            else:
                raise

    return encoded
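
A minimal sketch of the add_unknown flag, assuming the configured alphabet does not contain "N", so the kmer "NNN" is out of vocabulary:

try:
    tokenizer.encode("AAANNN")  # raises KeyError because "NNN" is not in the vocab
except KeyError:
    ids = tokenizer.encode("AAANNN", add_unknown=True)  # "NNN" becomes unk_token_id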

encode_batch(self, seqs)

Encode a list of sequences into token ids.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode_batch(self, seqs: List[str]) -> List[List[int]]:
    """Tokenize a list of sequences."""
    return [self.encode(s) for s in seqs]

resolve_alphabet(alphabet) classmethod

Resolve the alphabet if it's a named alphabet.

Parameters:

    alphabet (str, required)
        The raw alphabet, either the sequence literal or a name of a preset alphabet.

Returns:

    str: The new alphabet.

Source code in gcgc/tokenizer/kmer_tokenzier.py
@validator("alphabet")
def resolve_alphabet(cls, alphabet: str) -> str:
    """Resolve the alphabet if it's a named alphabet.

    Args:
        alphabet: The raw alphabet, either the sequence literal or a name of a preset alphabet.

    Returns:
        The new alphabet.

    """
    return alphabets.resolve_alphabet(alphabet)
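
The set of preset names that alphabets.resolve_alphabet accepts is not listed here. Conceptually it maps a preset name to its letters and passes a literal alphabet through unchanged; a minimal sketch of that behavior, where the dict name and entry are illustrative rather than the library's own:

NAMED_ALPHABETS = {"unambiguous_dna": "ATCG"}  # illustrative entry only

def resolve_alphabet_sketch(alphabet: str) -> str:
    """Return the preset's letters if alphabet is a known name, else the literal alphabet."""
    return NAMED_ALPHABETS.get(alphabet, alphabet)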

tokenize(self, seq)

Tokenize the sequence into a list of tokens.

Parameters:

    seq (str, required)
        The sequence to tokenize.

Returns:

    List[str]: The list of strings that are the tokens.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize(self, seq: str) -> List[str]:
    """Tokenize the sequence into a list of tokens.

    Args:
        seq: The sequence to tokenize.

    Returns:
        The list of strings that are the tokens.

    """
    seq_len = len(seq)

    if seq_len < self.kmer_length:
        raise ValueError(
            f"seq length {seq_len} cannot be less than the kmer size {self.kmer_length}"
        )

    if self.kmer_length == 1:
        kmer_list = list(seq)
    else:
        kmer_list = self._kmer_n(seq)

    if self.bos_token:
        kmer_list = [self.bos_token] + kmer_list

    if self.eos_token:
        kmer_list = kmer_list + [self.eos_token]

    return super().apply_length_constraints(kmer_list)
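
Varying the stride changes the overlap between kmers. Assuming bare_tokenizer accepts a stride smaller than the kmer length, a stride of 1 yields overlapping kmers:

>>> overlapping = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=1)
>>> overlapping.tokenize('AAATT')
['AAA', 'AAT', 'ATT']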

tokenize_batch(self, seqs)

Tokenize a list of sequences.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize_batch(self, seqs: List[str]) -> List[List[str]]:
    """Tokenize a list of sequences."""
    return [self.tokenize(s) for s in seqs]