From 885eaac677c2e547851759b52405567e03c802e4 Mon Sep 17 00:00:00 2001 From: mak-454 Date: Mon, 9 May 2022 17:49:30 +0530 Subject: [PATCH 1/5] fix to pass gpus to strategy if found. Fixes warning WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce. --- mnist-distributed/mirrored_mnist.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py index f410e7f6..a3d9174e 100644 --- a/mnist-distributed/mirrored_mnist.py +++ b/mnist-distributed/mirrored_mnist.py @@ -30,7 +30,8 @@ def data_loader(hyperparams): ) def model_with_strategy(learning_rate): - strategy = tf.distribute.MirroredStrategy() + gpus = tf.config.list_logical_devices('GPU') + strategy = tf.distribute.MirroredStrategy(gpus) with strategy.scope(): model = keras.Sequential( [ From 7d496c1078ba98d33caa3cc62f576b177f8c451a Mon Sep 17 00:00:00 2001 From: mak-454 Date: Mon, 9 May 2022 17:58:22 +0530 Subject: [PATCH 2/5] fix to not train in ps. 
--- mnist-distributed/mirrored_mnist.py | 36 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py index a3d9174e..117cf59f 100644 --- a/mnist-distributed/mirrored_mnist.py +++ b/mnist-distributed/mirrored_mnist.py @@ -31,7 +31,8 @@ def data_loader(hyperparams): def model_with_strategy(learning_rate): gpus = tf.config.list_logical_devices('GPU') - strategy = tf.distribute.MirroredStrategy(gpus) + if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps': + strategy = tf.distribute.MirroredStrategy(gpus) with strategy.scope(): model = keras.Sequential( [ @@ -56,22 +57,23 @@ def __init__(self, learning_rate=0.01, batch_size=64, epochs=2): self.train_dataset, self.test_dataset = data_loader(hyperparams) def train(self): - model = model_with_strategy(self.learning_rate) - #steps per epoch are reduced here to train on limited resources - #you are free to remove this argument - history = model.fit(self.train_dataset, - epochs=self.epochs, - shuffle=True, - steps_per_epoch=30, - validation_data=self.test_dataset, - validation_steps=30, - verbose=0) - - tf.saved_model.save(model,MODEL_DIR + str(1)) - val_losses = history.history['val_loss'] - val_accuracies = history.history['val_accuracy'] - for epoch, val_loss, val_accuracy in zip(range(self.epochs), val_losses, val_accuracies): - print("epoch {}:\nval_loss={:.2f}\nval_accuracy={:.2f}\n".format(epoch + 1, val_loss, val_accuracy)) + if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps': + model = model_with_strategy(self.learning_rate)i + #steps per epoch are reduced here to train on limited resources + #you are free to remove this argument + history = model.fit(self.train_dataset, + epochs=self.epochs, + shuffle=True, + steps_per_epoch=30, + validation_data=self.test_dataset, + validation_steps=30, + verbose=0) + + tf.saved_model.save(model,MODEL_DIR + str(1)) + val_losses = 
history.history['val_loss'] + val_accuracies = history.history['val_accuracy'] + for epoch, val_loss, val_accuracy in zip(range(self.epochs), val_losses, val_accuracies): + print("epoch {}:\nval_loss={:.2f}\nval_accuracy={:.2f}\n".format(epoch + 1, val_loss, val_accuracy)) if __name__ == "__main__": From 15a1b2f95f2f916345d70580b24ebfabf92e37cc Mon Sep 17 00:00:00 2001 From: mak-454 Date: Mon, 9 May 2022 18:09:31 +0530 Subject: [PATCH 3/5] Remove the ps check from model_with_strategy and fix a stray character in train. --- mnist-distributed/mirrored_mnist.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py index 117cf59f..5dff983c 100644 --- a/mnist-distributed/mirrored_mnist.py +++ b/mnist-distributed/mirrored_mnist.py @@ -31,8 +31,7 @@ def data_loader(hyperparams): def model_with_strategy(learning_rate): gpus = tf.config.list_logical_devices('GPU') - if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps': - strategy = tf.distribute.MirroredStrategy(gpus) + strategy = tf.distribute.MirroredStrategy(gpus) with strategy.scope(): model = keras.Sequential( [ @@ -58,7 +57,7 @@ def __init__(self, learning_rate=0.01, batch_size=64, epochs=2): def train(self): if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps': - model = model_with_strategy(self.learning_rate)i + model = model_with_strategy(self.learning_rate) #steps per epoch are reduced here to train on limited resources #you are free to remove this argument history = model.fit(self.train_dataset, From 654cf1d81efa45054c2654d841ac4a6ce567403f Mon Sep 17 00:00:00 2001 From: mak-454 Date: Mon, 9 May 2022 18:11:11 +0530 Subject: [PATCH 4/5] Read TF_CONFIG from the environment at module level.
--- mnist-distributed/mirrored_mnist.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py index 5dff983c..07994559 100644 --- a/mnist-distributed/mirrored_mnist.py +++ b/mnist-distributed/mirrored_mnist.py @@ -11,6 +11,8 @@ input_shape = (28, 28, 1) num_classes = 10 +TF_CONFIG = os.environ.get('TF_CONFIG') + def data_loader(hyperparams): f = gzip.open('/mnist/mnist.pkl.gz', 'rb') dataset = pickle.load(f, encoding='bytes') From a5c127069e430a5b5830e4ac9860f6212e9e7f3d Mon Sep 17 00:00:00 2001 From: mak-454 Date: Mon, 9 May 2022 18:12:33 +0530 Subject: [PATCH 5/5] Import ast, which the ast.literal_eval(TF_CONFIG) check requires. --- mnist-distributed/mirrored_mnist.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py index 07994559..9b0fc879 100644 --- a/mnist-distributed/mirrored_mnist.py +++ b/mnist-distributed/mirrored_mnist.py @@ -6,6 +6,7 @@ import numpy as np import tensorflow as tf import argparse +import ast MODEL_DIR = "/model/" input_shape = (28, 28, 1)