ThirdParty

Module holding third-party code.

GCGCTransformersTokenizer

A GCGC Tokenizer that is compatible with the transformers library.

vocab_size property readonly

Return the size of the vocabulary.

__init__(self, alphabet, kmer_length, kmer_stride, **kwargs) special

Init the GCGCTransformersTokenizer object.

Source code in gcgc/third_party.py
def __init__(self, alphabet, kmer_length, kmer_stride, **kwargs: Any):
    """Init the GCGCTransformersTokenizer object."""
    self.kmer_tokenizer = KmerTokenizer(
        alphabet=alphabet,
        kmer_length=kmer_length,
        kmer_stride=kmer_stride,
        bos_token=kwargs.get("bos_token"),
        eos_token=kwargs.get("eos_token"),
        unk_token=kwargs.get("unk_token"),
        pad_token=kwargs.get("pad_token"),
        mask_token=kwargs.get("mask_token"),
    )

    super().__init__(
        alphabet=alphabet, kmer_length=kmer_length, kmer_stride=kmer_stride, **kwargs,
    )

    self.init_inputs = (alphabet, kmer_length, kmer_stride)
    self.init_kwargs = kwargs
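
A minimal construction sketch. The alphabet value and the special-token strings here are illustrative assumptions, not defaults of this module; encode is inherited from the transformers base tokenizer (it is the method GenomicDataset uses below).

tokenizer = GCGCTransformersTokenizer(
    alphabet="ATCG",  # assumed alphabet value for illustration
    kmer_length=3,
    kmer_stride=3,
    bos_token="<bos>",
    eos_token="<eos>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

token_ids = tokenizer.encode("ATGCGTAAA")  # encode comes from the transformers base class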

convert_tokens_to_string(self, tokens)

Convert the tokens into a string.

Source code in gcgc/third_party.py
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """Convert the tokens into a string."""
    return "".join(tokens)

from_kmer_tokenizer(kmer_tokenizer) classmethod

Init from a kmer tokenizer.

Source code in gcgc/third_party.py
@classmethod
def from_kmer_tokenizer(cls, kmer_tokenizer: KmerTokenizer):
    """Init from a kmer tokenizer."""
    return cls(
        kmer_tokenizer.alphabet,
        kmer_tokenizer.kmer_length,
        kmer_tokenizer.kmer_stride,
        bos_token=kmer_tokenizer.bos_token,
        eos_token=kmer_tokenizer.eos_token,
        unk_token=kmer_tokenizer.unk_token,
        pad_token=kmer_tokenizer.pad_token,
        mask_token=kmer_tokenizer.mask_token,
    )
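
A sketch of promoting an existing KmerTokenizer, reusing the illustrative alphabet and special tokens from the construction example above:

kmer_tokenizer = KmerTokenizer(
    alphabet="ATCG",  # assumed alphabet value
    kmer_length=3,
    kmer_stride=3,
    bos_token="<bos>",
    eos_token="<eos>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
transformers_tokenizer = GCGCTransformersTokenizer.from_kmer_tokenizer(kmer_tokenizer)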

get_vocab(self)

Return the vocabulary for this transformer.

Source code in gcgc/third_party.py
def get_vocab(self):
    """Return the vocabulary for this transformer."""
    return self.kmer_tokenizer.vocab.stoi
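
The mapping is the underlying k-mer vocabulary's string-to-integer table. Assuming vocab_size counts the same table, the two agree:

vocab = tokenizer.get_vocab()
assert len(vocab) == tokenizer.vocab_size  # assumption: vocab_size is the size of this map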

save_vocabulary(self, save_directory)

Save the vocabulary's string-to-integer map in the save_directory.

Source code in gcgc/third_party.py
def save_vocabulary(self, save_directory) -> Tuple[str]:
    """Save the vocabulary string to integer map in the save_directory."""
    vocab_location = os.path.join(save_directory, self.VOCABULARY_FILENAME)

    with open(vocab_location, "w") as outf:
        json.dump(self.kmer_tokenizer.vocab.stoi, outf)

    return (vocab_location,)
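
A usage sketch; the target directory is created here purely for illustration:

import tempfile

save_directory = tempfile.mkdtemp()
(vocab_path,) = tokenizer.save_vocabulary(save_directory)
# vocab_path is the JSON file holding the string-to-integer map.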

GenomicDataset

GenomicDataset can be used to load sequence information into a format amenable to PyTorch.

__getitem__(self, i) special

Get the record from the index.

Source code in gcgc/third_party.py
def __getitem__(self, i: int):
    """Get the record from the index."""
    for value in self._file_index.values():
        try:
            seq_record = value[i]
        except KeyError:
            continue

        tokenized = self._tokenizer.encode(str(seq_record.seq))
        # pylint: disable=not-callable, no-member
        return torch.tensor(tokenized, dtype=torch.long)

    raise RuntimeError(f"Exhausted file index while looking for {i}.")

__init__(self, file_index, tokenizer) special

Initialize the GenomicDataset object.

Source code in gcgc/third_party.py
def __init__(
    self, file_index: Dict[Path, File._IndexedSeqFileDict], tokenizer: GCGCTransformersTokenizer
):
    """Initialize the GenomicDataset object."""
    self._file_index = file_index
    self._tokenizer = tokenizer

    super().__init__()

__len__(self) special

Return the length of the dataset.

Source code in gcgc/third_party.py
def __len__(self) -> int:
    """Return the length of the dataset."""
    return sum(len(v) for v in self._file_index.values())

from_path(path, tokenizer, file_format='fasta', alphabet=ExtendedIUPACProtein()) classmethod

Init from a single file. This is a convenience method that delegates to from_paths.

Source code in gcgc/third_party.py
@classmethod
def from_path(
    cls,
    path: Path,
    tokenizer: GCGCTransformersTokenizer,
    file_format: str = "fasta",
    alphabet=IUPAC.ExtendedIUPACProtein(),
) -> "GenomicDataset":
    """Init from a single file. This is a convenience method that delegates to from_paths."""
    return cls.from_paths([path], tokenizer, file_format, alphabet)

from_paths(path_sequence, tokenizer, file_format='fasta', alphabet=ExtendedIUPACProtein()) classmethod

Initialize the GenomicDataset from a pathlib.Path sequence.

Source code in gcgc/third_party.py
@classmethod
def from_paths(
    cls,
    path_sequence: Sequence[Path],
    tokenizer: GCGCTransformersTokenizer,
    file_format="fasta",
    alphabet=IUPAC.ExtendedIUPACProtein(),
) -> "GenomicDataset":
    """Initialize the GenomicDataset from a pathlib.Path sequence."""
    file_index = {}
    seq_indexer = _SequenceIndexer()

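    # Index files in sorted order; the shared _SequenceIndexer assigns
    # integer keys that are unique across all the files.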
    for file_path in sorted(path_sequence):
        file_index[file_path] = SeqIO.index(
            str(file_path), file_format, key_function=seq_indexer, alphabet=alphabet
        )

    return cls(file_index, tokenizer)
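
Putting it together, an end-to-end sketch. The FASTA path is hypothetical, and the default DataLoader collation stacks tensors, so this assumes every record encodes to the same length; supply a padding collate_fn otherwise.

from pathlib import Path

from torch.utils.data import DataLoader

dataset = GenomicDataset.from_path(Path("sequences.fasta"), tokenizer)  # hypothetical file

print(len(dataset))  # total number of records across all indexed files
print(dataset[0])    # torch.LongTensor of token ids for the first record

loader = DataLoader(dataset, batch_size=8)  # assumes equal-length encodings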