KmerTokenizer
This module holds the kmer tokenizer and its settings.
The KmerTokenizer tokenizes incoming sequences into kmers of configurable size against a configurable alphabet.
For example, given incoming nucleotide sequences, settings of kmer length 3 and stride 3 produce encoded sequences that are codons (in the loose sense), with a vocabulary of size 64.
>>> from gcgc import KmerTokenizer
>>> tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
>>> tokenizer.encode('AAATTTCCCGGG')
[0, 21, 42, 63]
>>> len(tokenizer.vocab)
64
>>> tokenizer.encode_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[[0, 21, 42, 63], [63, 42, 21, 0]]
>>> tokenizer.tokenize_batch(['AAATTTCCCGGG', 'GGGCCCTTTAAA'])
[['AAA', 'TTT', 'CCC', 'GGG'], ['GGG', 'CCC', 'TTT', 'AAA']]
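The kmer_length and kmer_stride settings are independent: a stride equal to the length yields non-overlapping chunks as above, while a smaller stride yields overlapping kmers. A minimal sketch of the windowing, assuming the usual sliding-window semantics (the actual logic lives in the internal _kmer_n helper, not shown here):

    # Illustrative sketch of stride-based kmer windowing; not gcgc's internal code.
    def kmer_chunks(seq: str, kmer_length: int, kmer_stride: int) -> list:
        """Slide a kmer_length window over seq, advancing kmer_stride at a time."""
        return [
            seq[i : i + kmer_length]
            for i in range(0, len(seq) - kmer_length + 1, kmer_stride)
        ]

    kmer_chunks("AAATTTCCCGGG", kmer_length=3, kmer_stride=3)
    # ['AAA', 'TTT', 'CCC', 'GGG'] -- non-overlapping, codon-like
    kmer_chunks("AAATTT", kmer_length=3, kmer_stride=1)
    # ['AAA', 'AAT', 'ATT', 'TTT'] -- overlapping windows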
KmerTokenizer
pydantic-model
The Kmer Tokenizer that encodes sequences into chunked kmers.
augment_vocab_with_kmer(values)
classmethod
Update the vocab of the SequenceTokenizer with a kmer alphabet.
Source code in gcgc/tokenizer/kmer_tokenzier.py
@root_validator
def augment_vocab_with_kmer(cls, values):  # pylint: disable=no-self-argument,no-self-use
    """Update the vocab of the SequenceTokenizer with a kmer alphabet."""
    vocab = values["vocab"]
    alphabet = values["alphabet"]
    kmer_length = values["kmer_length"]

    values["vocab"] = _create_kmer_vocab_from_token(vocab, alphabet, kmer_length)

    return values
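The _create_kmer_vocab_from_token helper is internal to gcgc, but the core idea is to enumerate every possible kmer over the alphabet. A hedged sketch, assuming kmers are numbered in alphabet order (an ordering consistent with the ids in the example at the top, where a four-letter alphabet and kmer_length=3 give 4 ** 3 = 64 entries):

    # Illustrative only; the real helper also accounts for the base vocab.
    import itertools

    def kmer_vocab(alphabet: str, kmer_length: int) -> dict:
        """Map every possible kmer over alphabet to an integer id."""
        kmers = ("".join(p) for p in itertools.product(alphabet, repeat=kmer_length))
        return {kmer: i for i, kmer in enumerate(kmers)}

    vocab = kmer_vocab("ATCG", 3)
    len(vocab)    # 64 == 4 ** 3
    vocab["AAA"]  # 0
    vocab["TTT"]  # 21
    vocab["GGG"]  # 63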
decode(self, token_ids)
Given the token ids, convert the list into the associated strings.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def decode(self, token_ids: List[int]) -> List[str]:
    """Given the token ids, convert the list into the associated strings."""
    return [self.vocab.itos[i] for i in token_ids]
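Because vocab.itos is the inverse of the string-to-id mapping, decode undoes encode. A usage sketch building on the example at the top:

    from gcgc import KmerTokenizer

    tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
    ids = tokenizer.encode("AAATTTCCCGGG")  # [0, 21, 42, 63]
    tokenizer.decode(ids)                   # ['AAA', 'TTT', 'CCC', 'GGG']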
encode(self, seq, add_unknown=False)
Encode the underlying sequence into a list of token ids.
Parameters:

Name | Type | Description | Default
---|---|---|---
seq | str | The incoming sequence to encode. | required
add_unknown | bool | If True, add the unknown token rather than throwing an out of vocabulary error. | False

Returns:

Type | Description
---|---
List[int] | A list of the encoded tokens.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode(self, seq: str, add_unknown: bool = False) -> List[int]:
    """Encode the underlying sequence into a list of token ids.

    Args:
        seq: The incoming sequence to encode.
        add_unknown: If True, add the unknown token rather than throwing an out of vocabulary
            error.

    Returns:
        A list of the encoded tokens.

    """
    encoded = []

    for letter in self.tokenize(seq):
        try:
            encoded.append(self.vocab[letter])
        except KeyError:
            if add_unknown:
                encoded.append(self.unk_token_id)
            else:
                raise

    return encoded
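As the source shows, an out-of-vocabulary kmer surfaces as the underlying KeyError unless add_unknown=True, in which case unk_token_id is substituted. A usage sketch, assuming 'N' is outside the configured alphabet and that the tokenizer defines an unknown token (a bare tokenizer may not):

    tokenizer.encode("AAANNN")
    # KeyError -- 'NNN' is not in the vocab
    tokenizer.encode("AAANNN", add_unknown=True)
    # [0, ...] -- the out-of-vocab kmer falls back to self.unk_token_id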
encode_batch(self, seqs)
Encode a list of sequences into token ids.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def encode_batch(self, seqs: List[str]) -> List[List[int]]:
    """Encode a list of sequences into token ids."""
    return [self.encode(s) for s in seqs]
resolve_alphabet(alphabet)
classmethod
Resolve the alphabet if it's a named alphabet.
Parameters:

Name | Type | Description | Default
---|---|---|---
alphabet | str | The raw alphabet, either the sequence literal or a name of a preset alphabet. | required

Returns:

Type | Description
---|---
str | The new alphabet.
Source code in gcgc/tokenizer/kmer_tokenzier.py
@validator("alphabet")
def resolve_alphabet(cls, alphabet: str) -> str:
    """Resolve the alphabet if it's a named alphabet.

    Args:
        alphabet: The raw alphabet, either the sequence literal or a name of a preset alphabet.

    Returns:
        The new alphabet.

    """
    return alphabets.resolve_alphabet(alphabet)
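The heavy lifting happens in gcgc.alphabets.resolve_alphabet, where the preset alphabet names are defined. A hypothetical sketch of name-or-literal resolution (the preset names below are made up for illustration; consult gcgc.alphabets for the real ones):

    # Hypothetical preset table; the real mapping lives in gcgc.alphabets.
    _PRESETS = {"dna": "ATCG", "protein": "ACDEFGHIKLMNPQRSTVWY"}

    def resolve_alphabet(alphabet: str) -> str:
        """Return a preset's letters if alphabet names one; otherwise treat it as a literal."""
        return _PRESETS.get(alphabet, alphabet)

    resolve_alphabet("dna")   # 'ATCG'
    resolve_alphabet("ATCG")  # 'ATCG' (already a literal)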
tokenize(self, seq)
Tokenize the sequence into a list of tokens.
Parameters:

Name | Type | Description | Default
---|---|---|---
seq | str | The sequence to encode. | required

Returns:

Type | Description
---|---
List[str] | The list of strings that are the tokens.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize(self, seq: str) -> List[str]:
    """Tokenize the sequence into a list of tokens.

    Args:
        seq: The sequence to encode.

    Returns:
        The list of strings that are the tokens.

    """
    seq_len = len(seq)

    if seq_len < self.kmer_length:
        raise ValueError(
            f"seq length {seq_len} cannot be less than the kmer size {self.kmer_length}"
        )

    if self.kmer_length == 1:
        kmer_list = list(seq)
    else:
        kmer_list = self._kmer_n(seq)

    if self.bos_token:
        kmer_list = [self.bos_token] + kmer_list

    if self.eos_token:
        kmer_list = kmer_list + [self.eos_token]

    return super().apply_length_constraints(kmer_list)
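One consequence of the length check above: sequences shorter than kmer_length are rejected outright rather than padded. Continuing with the bare tokenizer from the earlier examples:

    tokenizer = KmerTokenizer.bare_tokenizer(kmer_length=3, kmer_stride=3)
    tokenizer.tokenize("AA")
    # ValueError: seq length 2 cannot be less than the kmer size 3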
tokenize_batch(self, seqs)
Tokenize a list of sequences.
Source code in gcgc/tokenizer/kmer_tokenzier.py
def tokenize_batch(self, seqs: List[str]) -> List[List[str]]:
    """Tokenize a list of sequences."""
    return [self.tokenize(s) for s in seqs]