from pathlib import Path
from typing import Optional, Iterable
import tensorflow as tf
from typing_extensions import Literal
import albumentations as A
def load(
        names: Iterable[Literal["mvtec", "visa"]] = ("mvtec", "visa"),
        data_dir=Path("anomaly_datasets"),
        pairing_mode: Literal[
            "result_only",
            "result_with_original",
            "result_with_contrastive_pair"
        ] = "result_only",
        create_artificial_anomalies=False,
        validation_split=0.2,
        subset_mode: Optional[Literal[
            "training",
            "validation",
            "test",
            "holdout"
        ]] = "training",
        drop_masks=True,
        width=256,
        height=256,
        repeat=True,
        anomaly_size: Optional[int] = None,
        process_deviation=A.Compose([]),
        global_transform=A.Compose([]),
        anomaly_composition=A.Compose([]),
        batch_size=8,
        seed=123,
        shuffle=True,
        peek=True,
        download=True,
        image_validation=False,
        crop_to_aspect_ratio=False,
        delete_tmp=True,
):
    """
    Convenience wrapper for download_and_prepare + DatasetBuilder.

    Returns a ``tf.data.Dataset`` (or a dict of them, see ``subset_mode``).

    - ``tfds_defect_detection.downloader.download_and_prepare``
    - ``tfds_defect_detection.data.DatasetBuilder``

    Executes ``download_and_prepare`` first.
    Then builds a ``tf.data.Dataset`` according to the arguments.

    Example

    .. code-block:: python

        ds = tfd.load(names=["mvtec"], data_dir=Path("."), batch_size=4)

    .. WARNING::
        Calling this function might potentially trigger the download
        of 30+ GiB to disk. Refer to the ``delete_tmp`` argument.

    Parameters
    ----------
    names : ``Iterable[Literal["mvtec", "visa"]]``
        List of named datasets to load. Defaults to ["mvtec", "visa"].
        Passing multiple dataset names returns the result of
        ``tf.data.Dataset.sample_from_datasets([dataset1, dataset2])``
    data_dir : ``pathlib.Path``
        Directory to read/write data. Images cached here will be included
        in consecutive runs without the need for further download.
    pairing_mode : ``str``
        - "result_only" - the X variable of the dataset only holds the
          processed image.
        - "result_with_original" - the X variable of the dataset holds a
          tuple of the processed image and original image
        - "result_with_contrastive_pair" - the X variable holds a tuple of
          the processed image and a random image of the same class
    create_artificial_anomalies : ``bool``
        Creates artificial anomalies on the processed image. Anomalies are
        created by copying a random polygon from the image and smoothly
        pasting it onto a different part of the image.
        You can alter the appearance of the anomaly with the
        'anomaly_composition' and 'anomaly_size' argument.
        Default is False.
    validation_split : ``float``

        .. list-table:: Example splits for ``validation_split=0.2`` (default)
            :widths: 15 10 30
            :header-rows: 1

            * - Split
              - Amount
              - Source
            * - Training
              - 80% (1 - 0.2) of
              - train folder
            * - Validation
              - 20% (0.2) of
              - train folder
            * - Test
              - 80% (1 - 0.2) of
              - test folder
            * - Holdout
              - 20% (0.2) of
              - test folder

    subset_mode : optional, ``str``
        - "training" - Returns the training split.
          Note that training images in mvtec and visa are non-anomalous
          images due to the unsupervised nature of the task.
          For semi-supervised learning see the parameter
          'create_artificial_anomalies'
        - "validation" - Returns the validation split.
          (Part of the train folder). Like the training split, for VisA and
          MVTEC this only contains defect-free images.
          Useful in combination with synthetic defects.
        - "test" - Returns the majority of the real test data.
          These images contain real defects and human annotated masks.
          You may want to set 'create_artificial_anomalies' to False.
        - "holdout" - Holdout set for final evaluation on human annotated
          defects.
          You may want to set 'create_artificial_anomalies' to False.
        - ``None`` - returns all of the above in a dictionary.
    drop_masks : ``bool``
        Whether to drop the masks, i.e. the Y variable of the dataset.
        Useful for inference tasks.
    width : ``int``
        Width of the images in a batch
    height : ``int``
        Height of the images in a batch
    repeat : ``bool``
        Whether to infinitely repeat the data. Defaults to ``True``
    anomaly_size : optional, ``int``
        If None the anomaly size will be random with min=width/8 and
        max=width/4
    process_deviation : ``albumentations.Transform``
        Data augmentation of the processed images
    global_transform : ``albumentations.Transform``
        Data augmentation of all images
    anomaly_composition : ``albumentations.Transform``
        Data augmentation of the synthetic anomaly patches.
    batch_size : ``int``
        Size of the batches of data. Default: 8
    seed : ``int``
        Optional random seed for shuffling.
    shuffle : ``bool``
        Whether to shuffle the input files. Defaults to True.
    peek : ``bool``
        Whether to plot first batch of images of the loaded data.
        Defaults to True.
    download : [DEPRECATED] optional, ``bool``
        This variable no longer has an effect.
        Whether download is set to ``False`` or ``True``, when
        ``data_dir`` already holds the expected folder structure from a
        previous run, the function will always try to use the cached
        version.
    image_validation : optional, ``bool``
        Whether to open all images before calling the DatasetBuilder.
        This will print the name of corrupted image files,
        which cannot be read by tensorflow. Defaults to ``False``
    crop_to_aspect_ratio : optional, ``bool``
        If True, resize the images without aspect ratio distortion.
        When the original aspect ratio differs from the target aspect
        ratio, the output image will be cropped so as to return the
        largest possible window in the image (of size image_size) that
        matches the target aspect ratio. By default
        (crop_to_aspect_ratio=False), aspect ratio may not be preserved.
    delete_tmp : optional, ``bool``
        If True (default), deletes temporary versions of the datasets.
        Only keeps the processed version of each dataset.
        This means the original archive, the original dataset,
        and any intermediate processing steps are deleted. Empty folders
        are kept as a hint that these have been processed. This saves
        memory and time on successive runs. The final results are still
        cached, so you can safely rerun this function without the need to
        download again. Yet, if you want to have a look at the original
        datasets, consider disabling this parameter.

    Returns
    -------
    ``tf.data.Dataset``
        the dataset requested, or if subset_mode is None,
        a dict<key: subset_mode, value: tf.data.Dataset>.

    Raises
    ------
    KeyError
        If ``subset_mode`` is neither ``None`` nor one of the four
        documented subset names. Raised before anything is downloaded.
    """
    from tfds_defect_detection.downloader import download_and_prepare
    from tfds_defect_detection.data import DatasetBuilder

    # Snapshot of the call arguments, forwarded as keyword arguments to
    # DatasetBuilder. Must stay ahead of every other local assignment so
    # that no helper variable leaks into it.
    # NOTE(review): because it is taken after the two imports above, the
    # snapshot also carries `download_and_prepare` and `DatasetBuilder`;
    # kept as-is so DatasetBuilder receives the same keyword set as before.
    kwargs = locals().copy()

    # subset_mode -> (DatasetBuilder `subset`, read from the test folders?)
    # "test"/"holdout" reuse the training/validation split logic, but on
    # the test image and mask folders.
    subset_specs = {
        "training": ("training", False),
        "validation": ("validation", False),
        "test": ("training", True),
        "holdout": ("validation", True),
    }

    # Fail fast: reject a bad mode before a potentially huge download.
    if subset_mode is not None and subset_mode not in subset_specs:
        raise KeyError(
            f"Unknown subset_mode {subset_mode!r}; expected None or one of "
            f"{list(subset_specs)}"
        )

    # Materialised because the subset_mode=None path walks it once per subset.
    all_folders = list(download_and_prepare(
        cache_dir=data_dir,
        names=names,
        download=download,
        image_validation=image_validation,
        delete_tmp=delete_tmp
    ))

    def build_subset(mode):
        # Build one dataset per downloaded source and mix them by sampling.
        subset, from_test_folder = subset_specs[mode]
        # Each builder sees the mode it is actually built for, so
        # load(subset_mode=None)[m] is built with the same arguments as
        # load(subset_mode=m).
        builder_kwargs = {**kwargs, "subset_mode": mode}
        per_source = []
        for train_folder, test_image_folder, test_mask_folder in all_folders:
            if from_test_folder:
                builder = DatasetBuilder(
                    image_directory=test_image_folder,
                    mask_directory=test_mask_folder,
                    subset=subset,
                    **builder_kwargs
                )
            else:
                # No mask_directory is passed for the train-folder subsets.
                builder = DatasetBuilder(
                    image_directory=train_folder,
                    subset=subset,
                    **builder_kwargs
                )
            per_source.append(builder.ds)
        return tf.data.Dataset.sample_from_datasets(per_source)

    if subset_mode is None:
        # Documented contract: None returns every subset, keyed by its name.
        return {mode: build_subset(mode) for mode in subset_specs}
    return build_subset(subset_mode)
if __name__ == "__main__":
    # Intentionally a no-op: this module only provides `load` and has no
    # command-line behaviour of its own.
    pass