# Source code for tfds_defect_detection

# Version string of the tfds_defect_detection package.
__version__ = "0.1.0"

from pathlib import Path
from typing import Optional, Iterable

import tensorflow as tf
from typing_extensions import Literal
import albumentations as A


def load(
        names: Iterable[Literal["mvtec", "visa"]] = ("mvtec", "visa"),
        data_dir=Path("anomaly_datasets"),
        pairing_mode: Literal[
            "result_only",
            "result_with_original",
            "result_with_contrastive_pair"
        ] = "result_only",
        create_artificial_anomalies=False,
        validation_split=0.2,
        subset_mode: Optional[Literal[
            "training",
            "validation",
            "test",
            "holdout"
        ]] = "training",
        drop_masks=True,
        width=256,
        height=256,
        repeat=True,
        anomaly_size: Optional[int] = None,
        process_deviation=A.Compose([]),
        global_transform=A.Compose([]),
        anomaly_composition=A.Compose([]),
        batch_size=8,
        seed=123,
        shuffle=True,
        peek=True,
        download=True,
        image_validation=False,
        crop_to_aspect_ratio=False,
        delete_tmp=True,
):
    """
    Convenience wrapper for download_and_prepare + DatasetBuilder.

    Returns a ``tf.data.Dataset`` (or a dict of them, see ``subset_mode``).

    - ``tfds_defect_detection.downloader.download_and_prepare``
    - ``tfds_defect_detection.data.DatasetBuilder``

    Executes ``download_and_prepare`` first.
    Then builds a ``tf.data.Dataset`` according to the arguments.
    All arguments of this function are additionally forwarded to
    ``DatasetBuilder`` as keyword arguments.


    Example

    .. code-block:: python

        import tfds_defect_detection as tfd

        ds = tfd.load(names=["mvtec"], data_dir=Path("."), batch_size=4)

    .. WARNING::
        Warning: calling this function might potentially trigger the download
        of 30+ GiB to disk. Refer to the delete_tmp argument.

    Parameters
    ----------
    names : ``Iterable[Literal["mvtec", "visa"]]``
        List of named datasets to load. Defaults to ["mvtec", "visa"]. Passing
        multiple dataset names returns the result of
        ``tf.data.Dataset.sample_from_datasets([dataset1, dataset2])``
    data_dir : ``pathlib.Path``
        Directory to read/write data. Images cached here will be included in
        consecutive runs without the need for further download.
    pairing_mode : ``str``
        - "result_only" - the X variable of the dataset only holds the processed image.
        - "result_with_original" - the X variable of the dataset holds a tuple of the processed image and original image
        - "result_with_contrastive_pair" - the X variable holds a tuple of the processed image and a random image of the same class
    create_artificial_anomalies : ``bool``
        Creates artificial anomalies on the
        processed image. Anomalies are created by copying a random polygon from the
        image and smoothly pasting it onto a different part of the image.
        You can alter the appearance of the anomaly with the 'anomaly_composition'
        and 'anomaly_size' argument. Default is False
    validation_split : ``float``
        .. list-table:: Example splits for ``validation_split=0.2`` (default)
           :widths: 15 10 30
           :header-rows: 1

           * - Split
             - Amount
             - Source
           * - Training
             - 80% (1 - 0.2) of
             - train folder
           * - Validation
             - 20% (0.2) of
             - train folder
           * - Test
             - 80% (1 - 0.2) of
             - test folder
           * - Holdout
             - 20% (0.2) of
             - test folder


    subset_mode : optional, ``str``
        - "training" - Returns the training split.
            Note that training images in
            mvtec and visa are non-anomalous images due to the
            unsupervised nature of the task. For semi-supervised
            learning see the parameter 'create_artificial_anomalies'
        - "validation" - Returns the validation split.
            (Part of the train folder). Like the training split, for VisA and
            MVTEC this only contains defect-free images.
            Useful in combination with synthetic defects.
        - "test" - Returns the majority of the real test data.
            These images contain real defects and human annotated masks.
            You may want to set 'create_artificial_anomalies' to False.
        - "holdout" - Holdout set for final evaluation on human annotated defects.
            You may want to set 'create_artificial_anomalies' to False.
        - ``None`` - returns all of the above in a dictionary.
    drop_masks : ``bool``
        Whether to drop the masks, i.e. the Y variable of the dataset.
        Useful for inference tasks.
    width : ``int``
        Width of the images in a batch
    height : ``int``
        Height of the images in a batch
    repeat : ``bool``
        Whether to infinitely repeat the data. Defaults to ``True``
    anomaly_size : optional, ``int``
        If None the anomaly size will be random with min=width/8 and max=width/4
    process_deviation : ``albumentations.Transform``
        Data augmentation of the processed images
    global_transform : ``albumentations.Transform``
        Data augmentation of all images
    anomaly_composition : ``albumentations.Transform``
        Data augmentation of the synthetic anomaly patches.
    batch_size : ``int``
        Size of the batches of data. Default: 8
    seed : ``int``
        Optional random seed for shuffling.
    shuffle : ``bool``
        Whether to shuffle the input files. Defaults to True.
    peek : ``bool``
        Whether to plot first batch of images of the loaded data.
        Defaults to True.
    download : [DEPRECATED] optional, ``bool``
        This variable no longer has an effect.
        Whether download is set to ``False`` or ``True``, when
        ``data_dir`` already holds the expected folder structure from a
        previous run, the function will always try to use the cached version
    image_validation : optional, ``bool``
        Whether to open all images before calling the DatasetBuilder.
        This will print the name of corrupted image files,
        which cannot be read by tensorflow. Defaults to ``False``
    crop_to_aspect_ratio : optional, ``bool``
        If True, resize the images without aspect ratio distortion.
        When the original aspect ratio differs from the target aspect ratio,
        the output image will be cropped so as to return the largest possible
        window in the image (of size image_size) that matches
        the target aspect ratio. By default, (crop_to_aspect_ratio=False),
        aspect ratio may not be preserved.
    delete_tmp : optional, ``bool``
        If True (default), deletes temporary versions of the datasets.
        Only keeps the processed version of each dataset.
        This means the original archive, the original dataset,
        and any intermediate processing steps are deleted. Empty folders are
        kept as a hint that these have been processed. This saves memory
        and time on successive runs. The final results are still cached,
        so you can safely rerun this function without the need to download
        again. Yet, if you want to have a look at the original datasets,
        consider disabling this parameter.

    Returns
    -------
    ``tf.data.Dataset``
        the dataset requested, or if subset_mode is None,
        a dict<key: subset_mode, value: tf.data.Dataset>.

    Raises
    ------
    KeyError
        If ``subset_mode`` is neither ``None`` nor one of the four
        documented split names.
    """
    # Snapshot the call arguments before any other local name is bound, so
    # that only this function's parameters are forwarded to DatasetBuilder
    # (and not the imported helpers or the locals defined further down).
    kwargs = locals().copy()

    from tfds_defect_detection.downloader import download_and_prepare
    from tfds_defect_detection.data import DatasetBuilder

    # Materialise the result: it is iterated once per requested split.
    all_folders = list(download_and_prepare(
        cache_dir=data_dir,
        names=names,
        download=download,
        image_validation=image_validation,
        delete_tmp=delete_tmp
    ))

    # Public split name -> (read from the test folders?, DatasetBuilder subset).
    # "test"/"holdout" reuse the builder's "training"/"validation" subsets,
    # applied to the test folders instead of the train folder.
    split_specs = {
        "training": (False, "training"),
        "validation": (False, "validation"),
        "test": (True, "training"),
        "holdout": (True, "validation"),
    }

    def build_split(split):
        """Build one split for every named dataset and mix the results."""
        from_test_folders, builder_subset = split_specs[split]
        per_dataset = []
        for train_folder, test_image_folder, test_mask_folder in all_folders:
            if from_test_folders:
                builder = DatasetBuilder(
                    image_directory=test_image_folder,
                    mask_directory=test_mask_folder,
                    subset=builder_subset,
                    **kwargs
                )
            else:
                builder = DatasetBuilder(
                    image_directory=train_folder,
                    subset=builder_subset,
                    **kwargs
                )
            per_dataset.append(builder.ds)
        return tf.data.Dataset.sample_from_datasets(per_dataset)

    if subset_mode is None:
        # Documented behaviour: return every split, keyed by its name.
        return {split: build_split(split) for split in split_specs}

    return build_split(subset_mode)


# Running this module directly as a script intentionally does nothing.
if __name__ == "__main__":
    pass