KmerTokenizer

This module holds the kmer tokenizer and its settings.

The KmerTokenizer tokenizes incoming sequences into kmers of configurable size over a configurable alphabet.

For example, given incoming nucleotide sequences, a kmer length of 3 and a stride of 3 yield encoded sequences of codons (in the loose sense), with a vocabulary of size 64 (4^3).

>>> from gcgc import KmerTokenizer
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)

>>> tokenizer.encode('AAATTTCCCGGG')
[0, 21, 42, 63]

>>> len(tokenizer.vocab)
64

>>> tokenizer.encode_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[[0, 21, 42, 63], [63, 42, 21, 0]]

>>> tokenizer.tokenize_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[['AAA', 'TTT', 'CCC', 'GGG'], ['GGG', 'CCC', 'TTT', 'AAA']]

KmerTokenizer (pydantic model)

The Kmer Tokenizer that encodes sequences into chunked kmers.

augment_vocab_with_kmer(values) classmethod

Update the vocab of the SequenceTokenizer with a kmer alphabet.

Source code in gcgc/tokenizer/kmer_tokenzier.py
@root_validator
def augment_vocab_with_kmer(cls, values):  # pylint: disable=no-self-argument,no-self-use
    """Update the vocab of the SequenceTokenizer with a kmer alphabet."""
    vocab = values["vocab"]
    alphabet = values["alphabet"]
    kmer_length = values["kmer_length"]

    values["vocab"] = _create_kmer_vocab_from_token(vocab, alphabet, kmer_length)
    return values
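
For intuition, the kmer vocabulary is the Cartesian product of the alphabet with itself kmer_length times. The sketch below is not gcgc's actual _create_kmer_vocab_from_token; it shows the idea, ignoring any special tokens the real vocab may also carry. With the alphabet ordered 'ATCG' it reproduces the ids from the example at the top of the page.

import itertools

def build_kmer_vocab(alphabet: str, kmer_length: int) -> dict:
    """Sketch: enumerate every kmer over the alphabet and assign it an id."""
    kmers = ("".join(p) for p in itertools.product(alphabet, repeat=kmer_length))
    return {kmer: token_id for token_id, kmer in enumerate(kmers)}

vocab = build_kmer_vocab("ATCG", 3)   # 4**3 == 64 entries
vocab["AAA"], vocab["TTT"], vocab["CCC"], vocab["GGG"]  # (0, 21, 42, 63)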

decode(self, token_ids)

Given the token ids, convert the list into the associated strings.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def decode(self, token_ids: List[int]) -> List[str]:
    """Given the token ids, convert the the list into the associated strings."""
    return [self.vocab.itos[i] for i in token_ids]
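
Continuing the session from the top of the page, decode inverts encode (illustrative output, assuming the same bare tokenizer):

>>> tokenizer.decode([0, 21, 42, 63])
['AAA', 'TTT', 'CCC', 'GGG']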

encode(self, seq, add_unknown=False)

Encode the underlying sequence into a list of token ids.

Parameters:

- seq (str, required): The incoming sequence to encode.
- add_unknown (bool, default False): If True, add the unknown token rather than throwing an out-of-vocabulary error.

Returns:

- List[int]: A list of the encoded tokens.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode(self, seq: str, add_unknown: bool = False) -> List[int]:
    """Encode the underlying sequence into a list of tokens ids.

    Args:
        seq: The incoming sequence to encode.
        add_unknown: If True, add the unknown token rather than throwing an out of vocabulary
            error.

    Returns:
        A list of the encoded tokens.

    """
    encoded = []

    for letter in self.tokenize(seq):
        try:
            encoded.append(self.vocab[letter])
        except KeyError:
            if add_unknown:
                encoded.append(self.unk_token_id)
            else:
                raise

    return encoded
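
Out-of-vocabulary handling in practice: a kmer outside the alphabet, such as one containing an ambiguous base, either raises or maps to the unknown token. Illustrative only, assuming the tokenizer from above is configured with an unknown token:

tokenizer.encode('AAANNN')                    # raises KeyError: 'NNN' is not in the vocab
tokenizer.encode('AAANNN', add_unknown=True)  # 'NNN' is encoded as tokenizer.unk_token_id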

encode_batch(self, seqs)

Encode a list of sequences.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode_batch(self, seqs: List[str]) -> List[List[int]]:
    """Tokenize a list of sequences."""
    return [self.encode(s) for s in seqs]

resolve_alphabet(alphabet) classmethod

Resolve the alphabet if it's a named alphabet.

Parameters:

- alphabet (str, required): The raw alphabet, either the sequence literal or the name of a preset alphabet.

Returns:

- str: The new alphabet.

Source code in gcgc/tokenizer/kmer_tokenzier.py
@validator("alphabet")
def resolve_alphabet(cls, alphabet: str) -> str:
    """Resolve the alphabet if it's a named alphabet.

    Args:
        alphabet: The raw alphabet, either the sequence literal or a name of a preset alphabet.

    Returns:
        The new alphabet.

    """
    return alphabets.resolve_alphabet(alphabet)
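
The lookup itself lives in gcgc's alphabets module. A minimal sketch of the idea, with hypothetical preset names and letters (not gcgc's actual registry):

PRESET_ALPHABETS = {
    # Hypothetical entries for illustration only.
    "unambiguous_dna": "ATCG",
    "unambiguous_rna": "AUCG",
}

def resolve_alphabet(alphabet: str) -> str:
    """Return the preset's letters if the name is known, otherwise the literal."""
    return PRESET_ALPHABETS.get(alphabet, alphabet)

resolve_alphabet("unambiguous_dna")  # 'ATCG'
resolve_alphabet("ATCG")             # 'ATCG' (already a literal)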

tokenize(self, seq)

Tokenize the sequence into a list of tokens.

Parameters:

- seq (str, required): The sequence to tokenize.

Returns:

- List[str]: The list of strings that are the tokens.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize(self, seq: str) -> List[str]:
    """Tokenize the sequence into a list of tokens.

    Args:
        seq: The sequence to tokenize.

    Returns:
        The list of strings that are the tokens.

    """
    seq_len = len(seq)

    if seq_len < self.kmer_length:
        raise ValueError(
            f"seq length {seq_len} cannot be less than the kmer size {self.kmer_length}"
        )

    if self.kmer_length == 1:
        kmer_list = list(seq)
    else:
        kmer_list = self._kmer_n(seq)

    if self.bos_token:
        kmer_list = [self.bos_token] + kmer_list

    if self.eos_token:
        kmer_list = kmer_list + [self.eos_token]

    return super().apply_length_constraints(kmer_list)
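
For kmer_length > 1 the chunking is delegated to the private _kmer_n helper. A plausible sliding-window implementation, written here as a free function and assuming it honors kmer_stride (a sketch, not gcgc's verbatim code):

from typing import List

def kmer_n(seq: str, kmer_length: int, kmer_stride: int) -> List[str]:
    """Slide a window of kmer_length over seq, advancing by kmer_stride."""
    return [
        seq[i : i + kmer_length]
        for i in range(0, len(seq) - kmer_length + 1, kmer_stride)
    ]

kmer_n("AAATTTCCCGGG", 3, 3)  # ['AAA', 'TTT', 'CCC', 'GGG']
kmer_n("AAATTT", 3, 1)        # ['AAA', 'AAT', 'ATT', 'TTT'] -- overlapping kmers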

tokenize_batch(self, seqs)

Tokenize a list of sequences.

Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize_batch(self, seqs: List[str]) -> List[List[str]]:
    """Tokenize a list of sequences."""
    return [self.tokenize(s) for s in seqs]