I am building a TFX pipeline for time series with the open-source UCI Appliances Energy Prediction dataset (https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction). Since time-series training and evaluation require windowed sequences of data, I used the tf.keras.preprocessing.timeseries_dataset_from_array API inside the _input_fn function of the Trainer.py module file to create the windows for training and evaluation, but I got a TypeError: dataset length is infinite.
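For reference, here is a minimal sketch of how I understand timeseries_dataset_from_array is normally used, on an in-memory numpy array rather than a tf.data.Dataset (the array shapes and values below are made up for illustration):

import numpy as np
import tensorflow as tf

# Made-up in-memory data: 1000 time steps, 27 features.
data = np.random.rand(1000, 27).astype(np.float32)
targets = np.random.rand(1000).astype(np.float32)

# Sliding windows of 144 consecutive time steps, one window per batch.
windows = tf.keras.preprocessing.timeseries_dataset_from_array(
    data, targets, sequence_length=144, batch_size=1)

for batch_features, batch_targets in windows.take(1):
  print(batch_features.shape)  # (1, 144, 27)

This works because the data argument has a defined length along axis 0; in my _input_fn below I pass a tf.data.Dataset instead.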
The Trainer.py module file contains the code below:
from typing import List, Text
import os
import absl
import datetime
import tensorflow as tf
import tensorflow_transform as tft
from tfx.components.trainer.executor import TrainerFnArgs
from tfx.components.trainer.fn_args_utils import DataAccessor
from tfx_bsl.tfxio import dataset_options
LABEL_KEY = 'Appliances'

_DENSE_FLOAT_FEATURE_KEYS = [
    'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5',
    'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out',
    'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1',
    'rv2']


def _transformed_name(key):
  return key + '_xf'


def _transformed_names(keys):
  return [_transformed_name(key) for key in keys]
def _get_serve_tf_examples_fn(model, tf_transform_output):
  """Returns a function that parses a serialized tf.Example and applies TFT."""
  model.tft_layer = tf_transform_output.transform_features_layer()

  @tf.function
  def serve_tf_examples_fn(serialized_tf_examples):
    """Returns the output to be used in the serving signature."""
    feature_spec = tf_transform_output.raw_feature_spec()
    feature_spec.pop(LABEL_KEY)
    parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    return model(transformed_features)

  return serve_tf_examples_fn
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
    dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size, label_key=_transformed_name(LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)

  # This is the call that raises the TypeError (see the traceback below):
  # timeseries_dataset_from_array receives a tf.data.Dataset, not an array.
  sequence_generator = tf.keras.preprocessing.timeseries_dataset_from_array(
      dataset, dataset, sequence_length=144, batch_size=1)
  return sequence_generator
def _build_keras_model(hidden_units: List[int] = None) -> tf.keras.Model:
  """Creates a DNN Keras model for regressing the appliances energy data.

  Args:
    hidden_units: [int], the layer sizes of the DNN (input layer first).

  Returns:
    A keras Model.
  """
  real_valued_columns = [
      tf.feature_column.numeric_column(key, shape=())
      for key in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS)
  ]
  model = regressor_model(
      deep_columns=real_valued_columns,
      dnn_hidden_units=hidden_units or [100, 70, 50, 25])
  return model
def regressor_model(deep_columns, dnn_hidden_units):
  """Builds a simple Keras regressor model.

  Args:
    deep_columns: Feature columns for the deep part of the model.
    dnn_hidden_units: [int], the layer sizes of the hidden DNN.

  Returns:
    A deep Keras model.
  """
  # The following values are hard-coded for simplicity in this example;
  # preferably they should be passed in as hparams.

  # Keras needs the feature definitions at compile time.
  input_layers = {
      colname: tf.keras.layers.Input(name=colname, shape=(), dtype=tf.float32)
      for colname in _transformed_names(_DENSE_FLOAT_FEATURE_KEYS)
  }
  deep = tf.keras.layers.DenseFeatures(deep_columns)(input_layers)
  for numnodes in dnn_hidden_units:
    deep = tf.keras.layers.Dense(numnodes)(deep)
  output = tf.keras.layers.Dense(1)(deep)

  model = tf.keras.Model(input_layers, output)
  model.compile(
      loss='mean_absolute_error',
      optimizer=tf.keras.optimizers.Adam(lr=0.001),
      metrics=[tf.keras.metrics.MeanAbsoluteError()])
  model.summary(print_fn=absl.logging.info)
  return model
# TFX Trainer will call this function.
def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  # Number of nodes in the first layer of the DNN.
  first_dnn_layer_size = 100
  num_dnn_layers = 4
  dnn_decay_factor = 0.7
  # Window parameters (currently hard-coded inside _input_fn).
  sequence_length = 144
  batch_size = 1

  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = _input_fn(fn_args.train_files, fn_args.data_accessor,
                            tf_transform_output, 40)
  print('fn_args.train_files:', fn_args.train_files)
  print('fn_args.data_accessor:', fn_args.data_accessor)
  print('tf_transform_output:', tf_transform_output)
  print('train_dataset:')
  # for example in train_dataset.take(1):
  #   print(example)

  eval_dataset = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                           tf_transform_output, 40)
  print('fn_args.eval_files:', fn_args.eval_files)
  print('fn_args.data_accessor:', fn_args.data_accessor)
  print('tf_transform_output:', tf_transform_output)
  print('eval_dataset:', eval_dataset)

  model = _build_keras_model(
      # Construct layer sizes with exponential decay.
      hidden_units=[
          max(2, int(first_dnn_layer_size * dnn_decay_factor**i))
          for i in range(num_dnn_layers)
      ])

  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=fn_args.model_run_dir, update_freq='batch')

  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps,
      callbacks=[tensorboard_callback])

  signatures = {
      'serving_default':
          _get_serve_tf_examples_fn(model,
                                    tf_transform_output).get_concrete_function(
                                        tf.TensorSpec(
                                            shape=[None],
                                            dtype=tf.string,
                                            name='examples')),
  }
  model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)
Trainer configuration:
trainer = Trainer(
    module_file=os.path.abspath(trainer_module_file),
    custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
    examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    schema=schema_gen.outputs['schema'],
    train_args=trainer_pb2.TrainArgs(num_steps=10000),
    eval_args=trainer_pb2.EvalArgs(num_steps=5000))
context.run(trainer)
Error:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-53-cc5cfdc341eb> in <module>()
      7     train_args=trainer_pb2.TrainArgs(num_steps=10000),
      8     eval_args=trainer_pb2.EvalArgs(num_steps=5000))
----> 9 context.run(trainer)

/usr/local/lib/python3.6/dist-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run_if_ipython(*args, **kwargs)
     65       # __IPYTHON__ variable is set by IPython, see
     66       # https://ipython.org/ipython-doc/rel-0.10.2/html/interactive/reference.html#embedding-ipython.
---> 67       return fn(*args, **kwargs)
     68     else:
     69       absl.logging.warning(

/usr/local/lib/python3.6/dist-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run(self, component, enable_cache, beam_pipeline_args)
    180         telemetry_utils.LABEL_TFX_RUNNER: runner_label,
    181     }):
--> 182       execution_id = launcher.launch().execution_id
    183
    184     return execution_result.ExecutionResult(

/usr/local/lib/python3.6/dist-packages/tfx/orchestration/launcher/base_component_launcher.py in launch(self)
    203           execution_decision.input_dict,
    204           execution_decision.output_dict,
--> 205           execution_decision.exec_properties)
    206
    207     absl.logging.info('Running publisher for %s',

/usr/local/lib/python3.6/dist-packages/tfx/orchestration/launcher/in_process_component_launcher.py in _run_executor(self, execution_id, input_dict, output_dict, exec_properties)
     65         executor_context)  # type: ignore
     66
---> 67     executor.Do(input_dict, output_dict, exec_properties)

/usr/local/lib/python3.6/dist-packages/tfx/components/trainer/executor.py in Do(self, input_dict, output_dict, exec_properties)
    217     # Train the model
    218     absl.logging.info('Training model.')
--> 219     run_fn(fn_args)
    220
    221     # Note: If trained with multi-node distribution workers, it is the user

/content/trainer.py in run_fn(fn_args)
    160
    161   train_dataset = _input_fn(fn_args.train_files, fn_args.data_accessor,
--> 162                             tf_transform_output, 40)
    163
    164   print('fn_args.train_files:', fn_args.train_files)

/content/trainer.py in _input_fn(file_pattern, data_accessor, tf_transform_output, batch_size)
     70
     71   dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
---> 72       dataset, dataset, sequence_length=144, batch_size=1)
     73
     74   # label_key=_transformed_name(LABEL_KEY)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/preprocessing/timeseries.py in timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride, sampling_rate, batch_size, shuffle, seed, start_index, end_index)
    117   """
    118   # Validate the shape of data and targets
--> 119   if targets is not None and len(targets) != len(data):
    120     raise ValueError('Expected data and targets to have the same number of '
    121                      'time steps (axis 0) but got '

/usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py in __len__(self)
    443     length = self.cardinality()
    444     if length.numpy() == INFINITE:
--> 445       raise TypeError("dataset length is infinite.")
    446     if length.numpy() == UNKNOWN:
    447       raise TypeError("dataset length is unknown.")

TypeError: dataset length is infinite.
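From the last two frames, timeseries_dataset_from_array calls len() on its data and targets arguments, and the dataset returned by tf_dataset_factory appears to repeat indefinitely, so its length is undefined. A minimal reproduction outside TFX, with a made-up repeated dataset, raises the same error:

import tensorflow as tf

# Any dataset repeated without a count has infinite cardinality,
# so calling len() on it raises the same TypeError as above.
ds = tf.data.Dataset.from_tensor_slices([1.0, 2.0, 3.0]).repeat()
print(ds.cardinality() == tf.data.experimental.INFINITE_CARDINALITY)  # tf.Tensor(True, ...)
len(ds)  # TypeError: dataset length is infinite.

So the windowing parameters are not the issue; the infinite cardinality of the dataset produced by tf_dataset_factory is what breaks the len() check. How can I create training and evaluation windows from the transformed examples in this setup?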