Source code for deepobs.quadratic.quadratic_input

# -*- coding: utf-8 -*-
"""
This module contains TensorFlow data loading functionality for a Simple N-Dimensional Noisy Quadratic Problem

        0.5* (theta - x)^T * Q * (theta - x)

where x is normally distributed with mean 0.0 and sigma given by the noise_level (default is 3.0).
"""

import tensorflow as tf
import numpy as np


[docs]class data_loading:
    """Short summary.

    Args:
        batch_size (int): Batch size. No default value is given.
        dim (int): Dimensionality of the data points and therefore the created quadratic problem. Defaults to ``100``
        train_size (int): Size of the training set. Defaults to ``1000``.
        noise_level (float): Noise level of the training set. All training points are sampled from a gaussian distribution with the noise level as the standard deviation. Defaults to ``6``.

    Attributes:
        batch_size (int): Batch size. No default value is given.
        dim (int): Dimensionality of the data points and therefore the created quadratic problem. Defaults to ``100``
        train_size (int): Size of the training set. Defaults to ``1000``.
        noise_level (float): Noise level of the training set. All training points are sampled from a gaussian distribution with the noise level as the standard deviation. Defaults to ``6``.
        D_train (tf.data.Dataset): The training data set.
        D_train_eval (tf.data.Dataset): The training evaluation data set. It is the same data as `D_train` but we go through it separately.
        D_test (tf.data.Dataset): The test data set. We use the mean of the data points. Thus, the test data set has just a single data point.
        phase (tf.Variable): Variable to describe which phase we are currently in. Can be "train", "train_eval" or "test". The phase variable can determine the behaviour of the network, for example deactivate dropout during evaluation.
        iterator (tf.data.Iterator): A single iterator for all three data sets. We us the initialization operators (see below) to switch this iterator to the data sets.
        X (tf.Tensor): Tensor holding data points. It has dimension `batch_size`.
        y (tf.Tensor): Tensor holding the labels of the data points. It has dimension `batch_size`.
        train_init_op (tf.Operation): A TensorFlow operation to be performed before starting every training epoch. It sets the `phase` variable to "train" and initializes the iterator to the training data set.
        train_eval_init_op (tf.Operation): A TensorFlow operation to be performed before starting every training eval phase. It sets the `phase` variable to "train_eval" and initializes the iterator to the training eval data set.
        test_init_op (tf.Operation): A TensorFlow operation to be performed before starting every test evaluation phase. It sets the `phase` variable to "test" and initializes the iterator to the test data set.

    """
    def __init__(self, batch_size, dim=100, train_size=1000, noise_level=6):
        """Initializes the data loading class.

        Args:
            batch_size (int): Batch size. No default value is given.
            dim (int): Dimensionality of the data points and therefore the created quadratic problem. Defaults to ``100``
            train_size (int): Size of the training set. Defaults to ``1000``.
            noise_level (float): Noise level of the training set. All training points are sampled from a gaussian distribution with the noise level as the standard deviation. Defaults to ``6``.

        """
        self.train_size = train_size  # The size of the test set
        self.batch_size = batch_size
        self.noise_level = noise_level
        self.dim = dim
        self.D_train = self.train_dataset(batch_size, dim=dim, size=train_size, noise_level=noise_level)
        self.D_train_eval = self.train_eval_dataset(batch_size, dim=dim, size=train_size, noise_level=noise_level)
        self.D_test = self.test_dataset(batch_size, dim=dim)
        self.phase = tf.Variable("train", name="phase", trainable=False)

        # Reinitializable iterator given types and shapes of the outputs (needs to be the same for train and test of course)
        self.iterator = tf.data.Iterator.from_structure(
            self.D_train.output_types, self.D_train.output_shapes)
        self.X, self.y = self.iterator.get_next()

        # Operations to do when switching the phase (initialize iterator and assign phase to phase variable)
        self.train_init_op = tf.group([self.iterator.make_initializer(
            self.D_train), tf.assign(self.phase, "train")], name="train_init_op")
        self.train_eval_init_op = tf.group([self.iterator.make_initializer(
            self.D_train_eval), tf.assign(self.phase, "train_eval")], name="train_eval_init_op")
        self.test_init_op = tf.group([self.iterator.make_initializer(
            self.D_test), tf.assign(self.phase, "test")], name="test_init_op")

[docs]    def load(self):
        """Returns the data (`X` and `y` ) and the phase variable.

        Returns:
            tupel: Tupel consisting of the data points (`X`), (`y`) and the phase variable (`phase`).

        """
        return self.X, self.y, self.phase

[docs]    def train_dataset(self, batch_size, dim, size, noise_level):
        """Creates the training data set.

        Args:
            batch_size (int): Batch size of the data points.
            dim (int): Dimensionality of each dat point.
            size (int): Size of the training data set, i.e. the number of data points in the train set.
            noise_level (float): Standard deviation of the data points around the mean. The data points are drawn from a Gaussian distribution.

        Returns:
            tf.data.Dataset: The training data set.

        """
        np.random.seed(42)
        data_x, data_y = np.random.normal(0.0, noise_level, (size, dim)), np.random.normal(0.0, noise_level, (size, dim))
        data_x = np.float32(data_x)
        data_y = np.float32(data_y)
        return self.make_dataset(data_x, data_y, batch_size, one_hot=True, shuffle=True, shuffle_buffer_size=self.train_size, num_prefetched_batches=10)

[docs]    def train_eval_dataset(self, batch_size, dim, size, noise_level):
        """Creates the train eval data set.

        Args:
            batch_size (int): Batch size of the data points.
            dim (int): Dimensionality of each dat point.
            size (int): Size of the train eval data set, i.e. the number of data points in the train eval set.
            noise_level (float): Standard deviation of the data points around the mean. The data points are drawn from a Gaussian distribution.

        Returns:
            tf.data.Dataset: The train eval data set.

        """
        np.random.seed(42)
        data_x, data_y = np.random.normal(0.0, noise_level, (size, dim)), np.random.normal(0.0, noise_level, (size, dim))
        data_x = np.float32(data_x)
        data_y = np.float32(data_y)
        return self.make_dataset(data_x, data_y, batch_size, one_hot=True, shuffle=True, shuffle_buffer_size=self.train_size, num_prefetched_batches=10)

[docs]    def test_dataset(self, batch_size, dim):
        """Creates the test data set.

        Args:
            batch_size (int): Batch size. Just a single data point is created, with the mean value of the Gaussian distributions of the training data set.
            dim (int): Dimension of the data points.

        Returns:
            tf.data.Dataset: The test data set.

        """
        # recovers the deterministic quadratic function
        data_x, data_y = np.zeros((1, dim)), np.zeros((1, dim))
        data_x = np.float32(data_x)
        data_y = np.float32(data_y)
        return self.make_dataset(data_x, data_y, batch_size, one_hot=True, shuffle=False, num_prefetched_batches=1)

[docs]    def make_dataset(self, data_x, data_y, batch_size, one_hot=True, shuffle=True, shuffle_buffer_size=10000, num_prefetched_batches=10):
        """Creates a data set from given data points.

        Args:
            data_x (np.array): Numpy array containing the ``X`` values of the data points.
            data_y (np.array): Numpy array containing the ``y`` values of the data points.
            batch_size (int): Batch size of the input-output pairs.
            one_hot (bool): Switch to turn on or off one-hot encoding of the labels. Defaults to ``True``.
            shuffle (bool):  Switch to turn on or off shuffling of the data set. Defaults to ``True``.
            shuffle_buffer_size (int): Size of the shuffle buffer. Defaults to ``10000`` the size of the `test` and `train eval` data set, meaning that they will be completely shuffled.
            num_prefetched_batches (int): Number of prefeteched batches, defaults to ``10``.

        Returns:
            tf.data.Dataset: Data set object created from the images and label files.

        """
        with tf.name_scope("quadratic"):
            with tf.device('/cpu:0'):
                D = tf.data.Dataset.from_tensor_slices((data_x, data_y))
                if shuffle:
                    D = D.shuffle(buffer_size=shuffle_buffer_size)
                D = D.batch(batch_size)
                D = D.prefetch(buffer_size=num_prefetched_batches)
                return D