BioSequencePiece

Module for Sentence Piece tokenization.

This tokenizer uses bindings to call a faster implementation of the algorithm, for now this is the google library: https://github.com/google/sentencepiece.

This tokenizer needs to be trained to learn its tokenization logic. See the fit* methods for different ways to fit the model prior to use.

BioSequencePiece

A sentence piece model for biological sequences.

sp_processor: SentencePieceProcessor (property, readonly)

Return the SentencePiece processor object.

__init__(self, settings=None)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
    def __init__(self, settings: Optional[BioSequencePieceSettings] = None):
        """Init the BioSequencePiece class.

        Args:
            settings: The settings for the tokenizer.

        Raises:
            RuntimeError: If the sentencepiece python library or the
                ``spm_train`` executable is unavailable.

        """
        # Report each missing dependency accurately: the previous single
        # check blamed "the python library" even when only the spm_train
        # binary was absent from PATH.
        if not has_spm:
            raise RuntimeError("Trying to use sentencepiece but the python library is missing!")
        if not shutil.which("spm_train"):
            raise RuntimeError(
                "Trying to use sentencepiece but the spm_train executable is not on PATH!"
            )

        self.settings = settings or BioSequencePieceSettings()
        super().__init__(settings)

        # Token -> id mapping; populated lazily by load_vocab.
        self.vocab: Dict[str, int] = {}
        self._sp_processor = None

Init the BioSequencePiece class.

Parameters

Name Type Description Default
settings Optional[gcgc.tokenizer.sentence_piece_tokenizer.BioSequencePieceSettings] The settings for the tokenizer. None

encode(self, seq)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
172
173
174
175
176
177
    def encode(self, seq: str) -> List[int]:
        """Encode the sequence into the list of integer ids for its tokens.

        Tokens absent from the vocabulary map to the unknown token's id.
        """
        unknown_id = self.vocab.get(self.settings.unk_token)
        pieces = self.encode_as_tokens(seq)
        return [self.vocab.get(piece, unknown_id) for piece in pieces]

Encode the underlying sequence into a list of token ids.

encode_as_tokens(self, seq)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
    def encode_as_tokens(self, seq: str) -> List[str]:
        """Tokenize the sequence into a list of string tokens.

        Args:
            seq: The sequence to encode.

        Returns:
            The list of strs that are the tokens.

        """
        # The vocabulary is loaded lazily on the first call.
        if not self.vocab:
            self.load_vocab()

        pieces = self.sp_processor.EncodeAsPieces(seq)
        return super().apply_length_constraints(pieces)

Tokenize the sequence into a list of string tokens.

Parameters

Name Type Description Default
seq str The sequence to encode. required

Returns

Type Description
List[str] The list of strs that are the tokens.

fit_on_fasta(self, fasta_file)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
112
113
114
115
116
117
118
119
120
121
122
    def fit_on_fasta(self, fasta_file: Path):
        """Run the SP algo on the fasta_file."""
        with tempfile.TemporaryDirectory() as workdir:
            training_text = Path(workdir) / "input_textfiles.txt"

            # Flatten the FASTA records into one raw sequence per line,
            # which is the input format fit_on_text expects.
            with fasta_file.open("r") as fasta_handle, training_text.open("w") as out_handle:
                for record in SeqIO.parse(fasta_handle, "fasta"):
                    out_handle.write(str(record.seq) + "\n")

            self.fit_on_text(training_text)

Run the SP algo on the fasta_file.

fit_on_list(self, sequence_list)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
124
125
126
127
128
129
130
131
    def fit_on_list(self, sequence_list: List[str]):
        """Fit the SP algo on a list."""
        with tempfile.TemporaryDirectory() as workdir:
            # Write the sequences one-per-line to a temporary text file,
            # then delegate to the text-based fit.
            training_text = Path(workdir) / "input_textfiles.txt"
            training_text.write_text("\n".join(sequence_list))
            self.fit_on_text(training_text)

Fit the SP algo on a list.

fit_on_text(self, text_file)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
    def fit_on_text(self, text_file: Path):
        """Run the SP algo on the text_file.

        Args:
            text_file: A file with one training sequence per line.

        """
        args = [
            f"--input={str(text_file)}",
            f"--model_prefix={self.settings.model_prefix}",
            f"--vocab_size={self.settings.vocab_size}",
            f"--model_type={self.settings.model_type}",
            f"--max_sentence_length={self.settings.max_sequence_length}",
            f"--num_sub_iterations={self.settings.num_sub_iterations}",
            f"--num_threads={self.settings.num_threads}",
        ]

        # Configure the unk token exactly once. Previously --unk_piece and
        # --unk_id were always appended above AND --unk_piece was appended
        # again here (duplicate flag), while the falsy branch emitted both
        # --unk_id={id} and --unk_id=-1 (conflicting flags).
        if self.settings.unk_token:
            args.extend(
                [
                    f"--unk_piece={self.settings.unk_token}",
                    f"--unk_id={self.settings.unk_token_id}",
                ]
            )
        else:
            args.extend(["--unk_id=-1"])

        if self.settings.bos_token:
            args.extend([f"--bos_piece={self.settings.bos_token}"])
        else:
            args.extend(["--bos_id=-1"])

        if self.settings.eos_token:
            args.extend([f"--eos_piece={self.settings.eos_token}"])
        else:
            args.extend(["--eos_id=-1"])

        if self.settings.pad_token:
            # Pad is excluded from the trained model (--pad_id=-1) and
            # tracked locally with id -1 instead.
            args.extend([f"--pad_piece={self.settings.pad_token}", "--pad_id=-1"])
            self.vocab[self.settings.pad_token] = -1
        else:
            args.extend(["--pad_id=-1"])

        spm.SentencePieceTrainer.Train(" ".join(args))

        self.load_vocab()

Run the SP algo on the text_file.

load_vocab(self)

Show source code in gcgc/tokenizer/sentence_piece_tokenizer.py
194
195
196
197
198
    def load_vocab(self):
        """Load the vocabulary from the model's vocab file.

        Each line is a tab-separated "piece<TAB>score" pair; the line number
        becomes the token's id.
        """
        # Use a context manager so the vocab file handle is closed promptly
        # (the original relied on the garbage collector to close it).
        with self.settings.model_vocab.open() as vocab_handle:
            for token_id, line in enumerate(vocab_handle):
                token = line.strip("\n").split("\t")[0]
                self.vocab[token] = token_id

Load the vocabulary from the file.

BioSequencePieceSettings

The settings for the sentence piece model.

Like the baseclass, `SequenceTokenizerSettings`, the schema (and thus available fields), can be
seen by using the `print_schema` classmethod.

```python
>>> print(BioSequencePieceSettings.schema_json(indent=2))
{
  "title": "SequenceTokenizerSettings"
  ...
}
```

model_path: Path (property, readonly)

Return the model path based on the prefix.

model_vocab: Path (property, readonly)

Return the model vocab based on the prefix.