From 885eaac677c2e547851759b52405567e03c802e4 Mon Sep 17 00:00:00 2001
From: mak-454 <ahmed.khan@oneconvergence.com>
Date: Mon, 9 May 2022 17:49:30 +0530
Subject: [PATCH 1/5] Pass GPUs to MirroredStrategy if found. Fixes warning
 WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not
 using nccl allreduce.

---
 mnist-distributed/mirrored_mnist.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py
index f410e7f6..a3d9174e 100644
--- a/mnist-distributed/mirrored_mnist.py
+++ b/mnist-distributed/mirrored_mnist.py
@@ -30,7 +30,8 @@ def data_loader(hyperparams):
     )
     
 def model_with_strategy(learning_rate):
-    strategy = tf.distribute.MirroredStrategy()
+    gpus = tf.config.list_logical_devices('GPU')
+    strategy = tf.distribute.MirroredStrategy(gpus)
     with strategy.scope():
         model = keras.Sequential(
                 [

From 7d496c1078ba98d33caa3cc62f576b177f8c451a Mon Sep 17 00:00:00 2001
From: mak-454 <ahmed.khan@oneconvergence.com>
Date: Mon, 9 May 2022 17:58:22 +0530
Subject: [PATCH 2/5] Skip training when the TF_CONFIG task type is 'ps'.

---
 mnist-distributed/mirrored_mnist.py | 36 +++++++++++++++--------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py
index a3d9174e..117cf59f 100644
--- a/mnist-distributed/mirrored_mnist.py
+++ b/mnist-distributed/mirrored_mnist.py
@@ -31,7 +31,8 @@ def data_loader(hyperparams):
     
 def model_with_strategy(learning_rate):
     gpus = tf.config.list_logical_devices('GPU')
-    strategy = tf.distribute.MirroredStrategy(gpus)
+    if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps':
+        strategy = tf.distribute.MirroredStrategy(gpus)
     with strategy.scope():
         model = keras.Sequential(
                 [
@@ -56,22 +57,23 @@ def __init__(self, learning_rate=0.01, batch_size=64, epochs=2):
         self.train_dataset, self.test_dataset = data_loader(hyperparams)
         
     def train(self):
-        model = model_with_strategy(self.learning_rate)
-        #steps per epoch are reduced here to train on limited resources
-        #you are free to remove this argument
-        history = model.fit(self.train_dataset, 
-                            epochs=self.epochs,
-                            shuffle=True,
-                            steps_per_epoch=30,
-                            validation_data=self.test_dataset,
-                            validation_steps=30,
-                            verbose=0)
-        
-        tf.saved_model.save(model,MODEL_DIR + str(1))
-        val_losses = history.history['val_loss']
-        val_accuracies = history.history['val_accuracy']
-        for epoch, val_loss, val_accuracy in zip(range(self.epochs), val_losses, val_accuracies):
-          print("epoch {}:\nval_loss={:.2f}\nval_accuracy={:.2f}\n".format(epoch + 1, val_loss, val_accuracy))
+        if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps':
+            model = model_with_strategy(self.learning_rate)i
+            #steps per epoch are reduced here to train on limited resources
+            #you are free to remove this argument
+            history = model.fit(self.train_dataset, 
+                                epochs=self.epochs,
+                                shuffle=True,
+                                steps_per_epoch=30,
+                                validation_data=self.test_dataset,
+                                validation_steps=30,
+                                verbose=0)
+            
+            tf.saved_model.save(model,MODEL_DIR + str(1))
+            val_losses = history.history['val_loss']
+            val_accuracies = history.history['val_accuracy']
+            for epoch, val_loss, val_accuracy in zip(range(self.epochs), val_losses, val_accuracies):
+              print("epoch {}:\nval_loss={:.2f}\nval_accuracy={:.2f}\n".format(epoch + 1, val_loss, val_accuracy))
           
           
 if __name__ == "__main__":

From 15a1b2f95f2f916345d70580b24ebfabf92e37cc Mon Sep 17 00:00:00 2001
From: mak-454 <ahmed.khan@oneconvergence.com>
Date: Mon, 9 May 2022 18:09:31 +0530
Subject: [PATCH 3/5] Drop ps check from model_with_strategy; remove stray 'i' typo in train.

---
 mnist-distributed/mirrored_mnist.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py
index 117cf59f..5dff983c 100644
--- a/mnist-distributed/mirrored_mnist.py
+++ b/mnist-distributed/mirrored_mnist.py
@@ -31,8 +31,7 @@ def data_loader(hyperparams):
     
 def model_with_strategy(learning_rate):
     gpus = tf.config.list_logical_devices('GPU')
-    if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps':
-        strategy = tf.distribute.MirroredStrategy(gpus)
+    strategy = tf.distribute.MirroredStrategy(gpus)
     with strategy.scope():
         model = keras.Sequential(
                 [
@@ -58,7 +57,7 @@ def __init__(self, learning_rate=0.01, batch_size=64, epochs=2):
         
     def train(self):
         if TF_CONFIG and ast.literal_eval(TF_CONFIG)['task']['type'] != 'ps':
-            model = model_with_strategy(self.learning_rate)i
+            model = model_with_strategy(self.learning_rate)
             #steps per epoch are reduced here to train on limited resources
             #you are free to remove this argument
             history = model.fit(self.train_dataset, 

From 654cf1d81efa45054c2654d841ac4a6ce567403f Mon Sep 17 00:00:00 2001
From: mak-454 <ahmed.khan@oneconvergence.com>
Date: Mon, 9 May 2022 18:11:11 +0530
Subject: [PATCH 4/5] Define TF_CONFIG from the environment.

---
 mnist-distributed/mirrored_mnist.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py
index 5dff983c..07994559 100644
--- a/mnist-distributed/mirrored_mnist.py
+++ b/mnist-distributed/mirrored_mnist.py
@@ -11,6 +11,8 @@
 input_shape = (28, 28, 1)
 num_classes = 10
 
+TF_CONFIG = os.environ.get('TF_CONFIG')
+
 def data_loader(hyperparams):
     f = gzip.open('/mnist/mnist.pkl.gz', 'rb')
     dataset = pickle.load(f, encoding='bytes')

From a5c127069e430a5b5830e4ac9860f6212e9e7f3d Mon Sep 17 00:00:00 2001
From: mak-454 <ahmed.khan@oneconvergence.com>
Date: Mon, 9 May 2022 18:12:33 +0530
Subject: [PATCH 5/5] Add missing 'import ast'.

---
 mnist-distributed/mirrored_mnist.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mnist-distributed/mirrored_mnist.py b/mnist-distributed/mirrored_mnist.py
index 07994559..9b0fc879 100644
--- a/mnist-distributed/mirrored_mnist.py
+++ b/mnist-distributed/mirrored_mnist.py
@@ -6,6 +6,7 @@
 import numpy as np
 import tensorflow as tf
 import argparse
+import ast
 
 MODEL_DIR = "/model/"
 input_shape = (28, 28, 1)