Source code for deepobs.tolstoi.tolstoi_input

# -*- coding: utf-8 -*-
"""Tensorflow data-loading functionality for character-level language modelling
on Tolstoi's War and Peace.

Large parts of this code are adapted from github.com/sherjilozair/char-rnn-tensorflow."""

import numpy as np
import tensorflow as tf
import os

from .. import dataset_utils


[docs]class data_loading: """Class providing the data loading functionality for the Tolstoi data set. Args: batch_size (int): Batch size of the input-output pairs. No default value is given. seq_length (int): Sequence length to be model in each step. No default value is given. Attributes: batch_size (int): Batch size of the input-output pairs. seq_length (int): Sequence length to be model in each step. train_eval_size (int): Number of data points to evaluate during the `train eval` phase. Currently set to ``658725`` the size of the test set. D_train (tf.data.Dataset): The training data set. D_train_eval (tf.data.Dataset): The training evaluation data set. It is the same data as `D_train` but we go through it separately. D_test (tf.data.Dataset): The test data set. phase (tf.Variable): Variable to describe which phase we are currently in. Can be "train", "train_eval" or "test". The phase variable can determine the behaviour of the network, for example deactivate dropout during evaluation. iterator (tf.data.Iterator): A single iterator for all three data sets. We us the initialization operators (see below) to switch this iterator to the data sets. X (tf.Tensor): Tensor holding the input text of the tolstoi data set for character prediction. It has dimension `batch_size` x `seq_length`. y (tf.Tensor): Tensor holding the target text of the tolstoi data set for character prediction, i.e. the input text shifted by a single character. It has dimension `batch_size` x `seq_length`. train_init_op (tf.Operation): A TensorFlow operation to be performed before starting every training epoch. It sets the `phase` variable to "train" and initializes the iterator to the training data set. train_eval_init_op (tf.Operation): A TensorFlow operation to be performed before starting every training eval phase. It sets the `phase` variable to "train_eval" and initializes the iterator to the training eval data set. test_init_op (tf.Operation): A TensorFlow operation to be performed before starting every test evaluation phase. It sets the `phase` variable to "test" and initializes the iterator to the test data set. """ def __init__(self, batch_size, seq_length): """Initializes the data loading class. Args: batch_size (int): Batch size of the input-output pairs. seq_length (int): Sequence length to be model in each step. """ self.train_eval_size = 658725 # The size of the test set self.batch_size = batch_size self.seq_length = seq_length # Load datasets DATA_DIR = os.path.join(dataset_utils.get_data_dir(), "tolstoi") self.D_train = self.make_text_dataset(os.path.join(DATA_DIR, "train.npy"), batch_size=batch_size, seq_length=seq_length) self.D_train_eval = self.make_text_dataset(os.path.join(DATA_DIR, "train.npy"), batch_size=batch_size, seq_length=seq_length, data_set_size=self.train_eval_size) self.D_test = self.make_text_dataset(os.path.join(DATA_DIR, "test.npy"), batch_size=batch_size, seq_length=seq_length) self.phase = tf.Variable("train", name="phase", trainable=False) # Reinitializable iterator given types and shapes of the outputs (needs to be the same for train and test of course) self.iterator = tf.data.Iterator.from_structure( self.D_train.output_types, self.D_train.output_shapes) self.X, self.y = self.iterator.get_next() # Operations to do when switching the phase (initialize iterator and assign phase to phase variable) self.train_init_op = tf.group([self.iterator.make_initializer( self.D_train), tf.assign(self.phase, "train")], name="train_init_op") self.train_eval_init_op = tf.group([self.iterator.make_initializer( self.D_train_eval), tf.assign(self.phase, "train_eval")], name="train_eval_init_op") self.test_init_op = tf.group([self.iterator.make_initializer( self.D_test), tf.assign(self.phase, "test")], name="test_init_op")
[docs] def load(self): """Returns the data (`X` (input text) and `y` (output text)) and the phase variable. Returns: tupel: Tupel consisting of the input text (`X`), the output text (`y`) and the phase variable (`phase`). """ return self.X, self.y, self.phase
[docs] def make_text_dataset(self, filepath, batch_size, seq_length, num_prefetched_batches=10, data_set_size=-1): """Produce a TensorFlow dataset from the filepath to the preprocessed data set. Args: filepath (str): Path to the ``.npy`` file containing the data set. batch_size (int): Batch size of the input-output pairs. seq_length (int): Sequence length to be model in each step. num_prefetched_batches (int): Number of prefeteched batches, defaults to ``10``. data_set_size (int): Size of the data set to extract from the images and label files. Defaults to ``-1`` meaning that the full data set is used. Returns: tf.data.Dataset: Data set object containing the input and output pair. """ # Load the array of character ids, determine the number of batches that can be # produced, given batch size and sequence lengh arr = np.load(filepath) print np.shape(arr) num_batches = int(np.floor((np.size(arr) - 1) / (batch_size * seq_length))) print "Num batches:", num_batches if num_batches == 0: raise ValueError("This dataset is to small to use with this batch size " "and sequence length.") # Create input and output, where output is the text shifted by one character x = arr[:num_batches * batch_size * seq_length] y = arr[1:num_batches * batch_size * seq_length + 1] # Split into batches and put into arrays X, Y, such that X[i,:] is the i-th batch x_batches = np.split(x.reshape(batch_size, -1), num_batches, 1) y_batches = np.split(y.reshape(batch_size, -1), num_batches, 1) X = np.array(x_batches) Y = np.array(y_batches) # Create dataset from X, Y arrays D = tf.data.Dataset.from_tensor_slices((X, Y)) D = D.take(data_set_size) D = D.prefetch(buffer_size=num_prefetched_batches) return D