Source code for deepmol.tokenizers.tokenizer

from abc import ABC, abstractmethod

from deepmol.base import Estimator
from deepmol.datasets import Dataset
from deepmol.parallelism.multiprocessing import JoblibMultiprocessing


class Tokenizer(Estimator, ABC):
    """
    Abstract base class for tokenizers operating on the SMILES of a dataset.

    Concrete subclasses must provide the ``_tokenize`` method together with
    the ``vocabulary`` and ``max_length`` properties.
    """

    def __init__(self, n_jobs: int) -> None:
        """
        Set up the tokenizer.

        Parameters
        ----------
        n_jobs: int
            Value forwarded as ``n_jobs`` to ``JoblibMultiprocessing`` when
            ``tokenize`` is called.
        """
        super().__init__()
        self.n_jobs = n_jobs

    def tokenize(self, dataset: Dataset) -> list:
        """
        Apply ``_tokenize`` to the SMILES of a dataset.

        Parameters
        ----------
        dataset: Dataset
            The dataset whose ``smiles`` attribute is tokenized.

        Returns
        -------
        tokens: list
            The output of the multiprocessing run, materialised as a list
            (presumably one ``_tokenize`` result per SMILES string; this
            depends on ``JoblibMultiprocessing.run`` - verify there).

        Raises
        ------
        ValueError
            If ``self._is_fitted`` is falsy, i.e. the tokenizer has not been
            fitted yet.
        """
        # NOTE(review): `_is_fitted` is not defined in this class; it is
        # presumably maintained by the Estimator base class - confirm.
        if not self._is_fitted:
            message = (
                "The tokenizer must be fitted before tokenizing a dataset. "
                "Call Tokenizer.fit(dataset) first."
            )
            raise ValueError(message)

        # Read the SMILES before building the runner, as the original did.
        molecules = dataset.smiles
        runner = JoblibMultiprocessing(process=self._tokenize, n_jobs=self.n_jobs)
        return list(runner.run(molecules))

    @abstractmethod
    def _tokenize(self, text: str) -> list:
        """
        Split a single text into tokens.

        Parameters
        ----------
        text: str
            The text to tokenize.

        Returns
        -------
        tokens: list
            The tokens extracted from ``text``.
        """

    @property
    @abstractmethod
    def vocabulary(self) -> list:
        """
        The vocabulary of the tokenizer.

        Returns
        -------
        vocabulary: list
            The vocabulary.
        """

    @property
    @abstractmethod
    def max_length(self) -> int:
        """
        The maximum length of a tokenized string.

        Returns
        -------
        max_length: int
            The maximum length of a tokenized string.
        """