ThirdParty

Modules for Hugging Face's transformers library.

GCGCTransformersTokenizer

A GCGC Tokenizer that is compatible with the transformers library.

vocab_size property readonly

Return the size of the vocabulary.

__init__(self, alphabet, kmer_length, kmer_stride, **kwargs) special

Init the GCGCTransformersTokenizer object.

Source code in gcgc/third_party/hf.py
def __init__(self, alphabet, kmer_length, kmer_stride, **kwargs: Dict[str, Any]):
    """Init the GCGCTransformersTokenizer object."""
    self.kmer_tokenizer = KmerTokenizer(
        alphabet=alphabet,
        kmer_length=kmer_length,
        kmer_stride=kmer_stride,
        bos_token=kwargs.get("bos_token"),
        eos_token=kwargs.get("eos_token"),
        unk_token=kwargs.get("unk_token"),
        pad_token=kwargs.get("pad_token"),
        mask_token=kwargs.get("mask_token"),
    )

    super().__init__(
        alphabet=alphabet, kmer_length=kmer_length, kmer_stride=kmer_stride, **kwargs,
    )

    self.init_inputs = (alphabet, kmer_length, kmer_stride)
    self.init_kwargs = kwargs
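
A minimal construction sketch. The import path follows the "Source code in gcgc/third_party/hf.py" note above; the string alphabet and the special-token values are illustrative assumptions, not confirmed by this excerpt.

from gcgc.third_party.hf import GCGCTransformersTokenizer

tokenizer = GCGCTransformersTokenizer(
    alphabet="ATCG",    # assumed: the alphabet is a plain string of residues
    kmer_length=3,
    kmer_stride=3,
    pad_token="<pad>",  # illustrative special-token choices
    unk_token="<unk>",
)

print(tokenizer.vocab_size)  # size of the underlying k-mer vocabulary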

convert_tokens_to_string(tokens) staticmethod

Convert the tokens into a string.

Source code in gcgc/third_party/hf.py
@staticmethod
def convert_tokens_to_string(tokens: List[str]) -> str:
    """Convert the tokens into a string."""
    return "".join(tokens)

from_kmer_tokenizer(kmer_tokenizer) classmethod

Init from a kmer tokenizer.

Source code in gcgc/third_party/hf.py
@classmethod
def from_kmer_tokenizer(cls, kmer_tokenizer: KmerTokenizer):
    """Init from a kmer tokenizer."""
    return cls(
        kmer_tokenizer.alphabet,
        kmer_tokenizer.kmer_length,
        kmer_tokenizer.kmer_stride,
        bos_token=kmer_tokenizer.bos_token,
        eos_token=kmer_tokenizer.eos_token,
        unk_token=kmer_tokenizer.unk_token,
        pad_token=kmer_tokenizer.pad_token,
        mask_token=kmer_tokenizer.mask_token,
    )
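
A sketch of converting an existing KmerTokenizer. The import path for KmerTokenizer is an assumption; the constructor keywords mirror the attributes that from_kmer_tokenizer reads above.

from gcgc import KmerTokenizer  # assumed import path
from gcgc.third_party.hf import GCGCTransformersTokenizer

kmer_tokenizer = KmerTokenizer(
    alphabet="ATCG",
    kmer_length=3,
    kmer_stride=3,
    bos_token="<s>",    # illustrative token strings; from_kmer_tokenizer
    eos_token="</s>",   # reads all five special-token attributes
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

hf_tokenizer = GCGCTransformersTokenizer.from_kmer_tokenizer(kmer_tokenizer)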

get_vocab(self)

Return the vocabulary for this transformer.

Source code in gcgc/third_party/hf.py
def get_vocab(self):
    """Return the vocabulary for this transformer."""
    return self.kmer_tokenizer.vocab.stoi

save_vocabulary(self, save_directory)

Save the vocabulary's string-to-integer map in the save_directory.

Source code in gcgc/third_party/hf.py
def save_vocabulary(self, save_directory) -> Tuple[str]:
    """Save the vocabulary string to integer map in the save_directory."""
    vocab_location = os.path.join(save_directory, self.VOCABULARY_FILENAME)

    with open(vocab_location, "w") as outf:
        json.dump(self.kmer_tokenizer.vocab.stoi, outf)

    return (vocab_location,)
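
A usage sketch. Note that the source opens the output file directly, so save_directory must already exist; VOCABULARY_FILENAME is a class attribute whose value is not shown in this excerpt.

import json
import os

save_dir = "vocab_out"                # hypothetical directory
os.makedirs(save_dir, exist_ok=True)  # save_vocabulary does not create it

(vocab_path,) = tokenizer.save_vocabulary(save_dir)  # tokenizer from the earlier sketch
with open(vocab_path) as handle:
    stoi = json.load(handle)          # the string-to-integer map written above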

GenomicDataset

GenomicDataset can be used to load sequence information into a format amenable to PyTorch.

__getitem__(self, i) special

Get the record from the index.

Source code in gcgc/third_party/hf.py
def __getitem__(self, i: int):
    """Get the record from the index."""
    for value in self._file_index.values():
        try:
            seq_record = value[i]
        except KeyError:
            continue

        tokenized = self._tokenizer.encode(str(seq_record.seq))
        # pylint: disable=not-callable, no-member
        return torch.tensor(tokenized, dtype=torch.long)

    # pylint: disable=useless-else-on-loop
    else:
        raise RuntimeError(f"Exhausted file index while looking for {i}.")

__init__(self, file_index, tokenizer) special

Initialize the GenomicDataset object.

Source code in gcgc/third_party/hf.py
def __init__(
    self, file_index: Dict[Path, File._IndexedSeqFileDict], tokenizer: GCGCTransformersTokenizer
):
    """Initialize the GenomicDataset object."""
    self._file_index = file_index
    self._tokenizer = tokenizer

    super().__init__()

__len__(self) special

Return the length of the dataset.

Source code in gcgc/third_party/hf.py
def __len__(self) -> int:
    """Return the length of the dataset."""
    return sum(len(v) for v in self._file_index.values())

from_path(path, tokenizer, file_format='fasta') classmethod

Init from a single file. This is a convenience method that delegates to from_paths.

Source code in gcgc/third_party/hf.py
@classmethod
def from_path(
    cls, path: Path, tokenizer: GCGCTransformersTokenizer, file_format: str = "fasta",
) -> "GenomicDataset":
    """Init from a single file. This is a convenience method that delegates to from_paths."""
    return cls.from_paths([path], tokenizer, file_format)

from_paths(path_sequence, tokenizer, file_format='fasta') classmethod

Initialize the GenomicDataset from a pathlib.Path sequence.

Source code in gcgc/third_party/hf.py
@classmethod
def from_paths(
    cls,
    path_sequence: Sequence[Path],
    tokenizer: GCGCTransformersTokenizer,
    file_format="fasta",
) -> "GenomicDataset":
    """Initialize the GenomicDataset from a pathlib.Path sequence."""
    file_index = {}
    seq_indexer = _SequenceIndexer()

    for file_path in sorted(path_sequence):
        file_index[file_path] = SeqIO.index(
            str(file_path), file_format, key_function=seq_indexer
        )

    return cls(file_index, tokenizer)
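
A usage sketch, assuming FASTA files on disk (the paths are hypothetical) and the tokenizer built in the earlier sketch. As __getitem__ above shows, records across all indexed files are addressed through a single integer index.

from pathlib import Path

from gcgc.third_party.hf import GenomicDataset

dataset = GenomicDataset.from_paths(
    [Path("train_a.fasta"), Path("train_b.fasta")],  # hypothetical FASTA files
    tokenizer,                                       # tokenizer from the earlier sketch
)

print(len(dataset))  # total records across all indexed files
first = dataset[0]   # a 1-D torch.long tensor of token ids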

Module holding PyTorch code.

GenomicDataset

GenomicDataset can be used to load sequence information into a format amenable to PyTorch.

__getitem__(self, i) special

Get the record from the index.

Source code in gcgc/third_party/pytorch.py
def __getitem__(self, i: int):
    """Get the record from the index."""
    for value in self._file_index.values():
        try:
            seq_record = value[i]
        except KeyError:
            continue

        tokenized = self._tokenizer.encode(str(seq_record.seq))
        # pylint: disable=not-callable, no-member
        return torch.tensor(tokenized, dtype=torch.long)

    # pylint: disable=useless-else-on-loop
    else:
        raise RuntimeError(f"Exhausted file index while looking for {i}.")

__init__(self, file_index, tokenizer) special

Initialize the GenomicDataset object.

Source code in gcgc/third_party/pytorch.py
def __init__(self, file_index: Dict[Path, File._IndexedSeqFileDict], tokenizer: KmerTokenizer):
    """Initialize the GenomicDataset object."""
    self._file_index = file_index
    self._tokenizer = tokenizer

    super().__init__()

__len__(self) special

Return the length of the dataset.

Source code in gcgc/third_party/pytorch.py
def __len__(self) -> int:
    """Return the length of the dataset."""
    return sum(len(v) for v in self._file_index.values())

from_path(path, tokenizer, file_format='fasta') classmethod

Init from a single file. This is a convenience method that delegates to from_paths.

Source code in gcgc/third_party/pytorch.py
@classmethod
def from_path(
    cls, path: Path, tokenizer: KmerTokenizer, file_format: str = "fasta",
) -> "GenomicDataset":
    """Init from a single file. This is a convenience method that delegates to from_paths."""
    return cls.from_paths([path], tokenizer, file_format)

from_paths(path_sequence, tokenizer, file_format='fasta') classmethod

Initialize the GenomicDataset from a pathlib.Path sequence.

Source code in gcgc/third_party/pytorch.py
@classmethod
def from_paths(
    cls, path_sequence: Sequence[Path], tokenizer: KmerTokenizer, file_format="fasta",
) -> "GenomicDataset":
    """Initialize the GenomicDataset from a pathlib.Path sequence."""
    file_index = {}
    seq_indexer = _SequenceIndexer()

    for file_path in sorted(path_sequence):
        file_index[file_path] = SeqIO.index(
            str(file_path), file_format, key_function=seq_indexer
        )

    return cls(file_index, tokenizer)
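
A batching sketch with torch.utils.data.DataLoader, assuming GenomicDataset is a map-style PyTorch dataset. Records tokenize to different lengths, so a pad-collate function is needed; the KmerTokenizer import path and the padding id of 0 are assumptions, not confirmed by this excerpt.

from pathlib import Path

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from gcgc import KmerTokenizer                       # assumed import path
from gcgc.third_party.pytorch import GenomicDataset

tokenizer = KmerTokenizer(alphabet="ATCG", kmer_length=3, kmer_stride=3)
dataset = GenomicDataset.from_path(Path("sequences.fasta"), tokenizer)  # hypothetical file

def pad_collate(batch):
    """Right-pad variable-length id tensors into one [batch, max_len] tensor."""
    return pad_sequence(batch, batch_first=True, padding_value=0)

loader = DataLoader(dataset, batch_size=8, collate_fn=pad_collate)
for batch in loader:
    print(batch.shape)  # (batch_size, longest sequence in the batch)
    break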