From 820e426d4472d03bb5ea807209ad102d4b754d9d Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Mon, 11 Jun 2018 21:19:45 +0300 Subject: [PATCH 01/64] 'Declare' & skeleton for 'function_transformation' - 1.3 version doesn't have the 'CustomGraphOptimizer' feature, so I 'manually' registered the optimizer. - 'function' optimizer does not exist either in 1.3, so 'function_transformation' took its place. - 'TESTS' directory contains some toy examples of recursion. --- TESTS/factorial.py | 25 +++ TESTS/fib.py | 27 +++ TESTS/func.py | 25 +++ TESTS/hello.py | 5 + TESTS/rec.py | 25 +++ tensorflow/core/grappler/optimizers/BUILD | 17 ++ .../optimizers/function_transformation.cc | 157 ++++++++++++++++++ .../optimizers/function_transformation.h | 44 +++++ .../grappler/optimizers/meta_optimizer.cc | 13 +- tensorflow/core/grappler/utils/BUILD | 17 ++ tensorflow/core/grappler/utils/functions.cc | 140 ++++++++++++++++ tensorflow/core/grappler/utils/functions.h | 39 +++++ .../core/protobuf/rewriter_config.proto | 2 + tensorflow/python/framework/function.py | 56 +++++++ 14 files changed, 591 insertions(+), 1 deletion(-) create mode 100644 TESTS/factorial.py create mode 100644 TESTS/fib.py create mode 100644 TESTS/func.py create mode 100644 TESTS/hello.py create mode 100644 TESTS/rec.py create mode 100644 tensorflow/core/grappler/optimizers/function_transformation.cc create mode 100644 tensorflow/core/grappler/optimizers/function_transformation.h create mode 100644 tensorflow/core/grappler/utils/functions.cc create mode 100644 tensorflow/core/grappler/utils/functions.h diff --git a/TESTS/factorial.py b/TESTS/factorial.py new file mode 100644 index 0000000000..b4487e9686 --- /dev/null +++ b/TESTS/factorial.py @@ -0,0 +1,25 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +fac = function.Declare("Fac", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="Fac", out_names=["ret"]) +def FacImpl(n): + return 
tf.cond(tf.less_equal(n, 1), + lambda: tf.constant(1), + lambda: n * fac(n - 1)) + + +FacImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +result = fac(n) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() +print(sess.run(result, feed_dict={n: 3})) + +writer.close() + +sess.close() \ No newline at end of file diff --git a/TESTS/fib.py b/TESTS/fib.py new file mode 100644 index 0000000000..f4adf29944 --- /dev/null +++ b/TESTS/fib.py @@ -0,0 +1,27 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +fib = function.Declare("Fib", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="Fib", out_names=["ret"]) +def FibImpl(n): + return tf.cond(tf.less_equal(n, 1), + lambda: tf.constant(1), + lambda: fib(n-1)) + +FibImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +res = fib(n) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() + +print(tf.get_default_graph().as_graph_def()) + + +writer.close() +#print(sess.run(res, feed_dict={n: 0})) + +sess.close() diff --git a/TESTS/func.py b/TESTS/func.py new file mode 100644 index 0000000000..ab0f62f546 --- /dev/null +++ b/TESTS/func.py @@ -0,0 +1,25 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +@function.Defun(tf.float32, tf.float32) +def MyFunc(x, y): + return x + y, x - y + + +# Building the graph. 
+ +a = tf.constant([4.0], name="a") +b = tf.placeholder(tf.float32, name="MyPlaceHolder") + +add = tf.add(a, b, name="add") +sub = tf.subtract(a, b, name="sub") + +c, d = MyFunc(add, sub, name='mycall') + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session() as sess: # no need to manually close the session + print(sess.run([add, sub], feed_dict={b:1})) + print(sess.run([c,d], feed_dict={b:1})) + +writer.close() diff --git a/TESTS/hello.py b/TESTS/hello.py new file mode 100644 index 0000000000..c804b5f983 --- /dev/null +++ b/TESTS/hello.py @@ -0,0 +1,5 @@ +# Python +import tensorflow as tf +hello = tf.constant('Hello, TensorFlow!') +sess = tf.Session() +print(sess.run(hello)) diff --git a/TESTS/rec.py b/TESTS/rec.py new file mode 100644 index 0000000000..a0afc23f22 --- /dev/null +++ b/TESTS/rec.py @@ -0,0 +1,25 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +fib = function.Declare("Fib", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="Fib", out_names=["ret"]) +def FibImpl(n): + return tf.cond(tf.less_equal(n, 1), + lambda: tf.constant(1), + lambda: fib(n - 1)) +# + fib(n - 2)) + +FibImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +result = fib(n) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() +print(sess.run(result, feed_dict={n: 2})) + +writer.close() + +sess.close() diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index c4def6cf23..794cd4b867 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -126,6 +126,22 @@ tf_cc_test( ], ) +cc_library( + name = "function_transformation", + srcs = ["function_transformation.cc"], + hdrs = [ + "function_transformation.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":graph_optimizer", + "//tensorflow/core:protos_all_cc", + 
"//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler/utils:functions", + ], +) + cc_library( name = "graph_rewriter", srcs = ["graph_rewriter.cc"], @@ -304,6 +320,7 @@ cc_library( ":arithmetic_optimizer", ":auto_parallel", ":constant_folding", + ":function_transformation", ":graph_optimizer", ":layout_optimizer", ":memory_optimizer", diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc new file mode 100644 index 0000000000..9b7e8eacf2 --- /dev/null +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -0,0 +1,157 @@ + +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/optimizers/function_transformation.h" +#include +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/utils/functions.h" + +namespace tensorflow { + namespace grappler { + + Status InlineFunction(const NodeDef& node, const FunctionDef& func, + GraphDef* graph) { + /* + const std::unordered_map attr(node.attr().begin(), + node.attr().end()); + FunctionDefLibrary library; + std::unique_ptr item = + GrapplerItemFromFunctionDef(func, attr, library); + + std::unordered_map input_nodes; + for (int i = 0; i < func.signature().input_arg_size(); i) { + const OpDef::ArgDef& arg = func.signature().input_arg(i); + input_nodes[arg.name()] = i; + } + + // Add an IdentityN op to hook the function inputs to: this ensures that + // they're all evaluated before the evaluation of the function body starts. 
+ NodeDef* func_inputs = graph->add_node(); + func_inputs->set_name(strings::StrCat(node.name(), "/", "inlined_inputs")); + func_inputs->set_op("IdentityN"); + *func_inputs->mutable_input() = node.input(); + AttrValue::ListValue* type_list = + (*func_inputs->mutable_attr())["T"].mutable_list(); + for (const OpDef::ArgDef& arg : func.signature().input_arg()) { + auto it = attr.find(arg.type_attr()); + if (it == attr.end()) { + return errors::InvalidArgument("Invalid input argument ", arg.name(), + " for function ", node.op(), + " instantiated by ", node.name()); + } + type_list->add_type(it->second.type()); + } + + for (NodeDef& func_body_node : *item->graph.mutable_node()) { + if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { + // Turn input placeholders into identity nodes + if (IsPlaceholder(func_body_node)) { + func_body_node.set_op("Identity"); + } + CHECK_EQ(0, func_body_node.input_size()); + int input_id = input_nodes[func_body_node.name()]; + func_body_node.add_input( + strings::StrCat(func_inputs->name(), ":", input_id)); + } else { + // Update the input names. + for (string& input : *func_body_node.mutable_input()) { + input = strings::StrCat(node.name(), "/", input); + } + } + + // Add the node name as a prefix to avoid collisions after inlining + func_body_node.set_name( + strings::StrCat(node.name(), "/", func_body_node.name())); + + // Move the node to the main graph + graph->add_node()->Swap(&func_body_node); + } + + // Add an IdentityN op to hook the function outputs to: this ensures that the + // function body is fully evaluated before its fanout gets scheduled. 
+ NodeDef* func_outputs = graph->add_node(); + func_outputs->set_name(node.name()); + func_outputs->set_op("IdentityN"); + type_list = (*func_outputs->mutable_attr())["T"].mutable_list(); + for (const OpDef::ArgDef& arg : func.signature().output_arg()) { + auto it = attr.find(arg.type_attr()); + if (it == attr.end()) { + return errors::InvalidArgument("Invalid output argument ", arg.name(), + " for function ", node.op(), + " instantiated by ", node.name()); + } + type_list->add_type(it->second.type()); + func_outputs->add_input(strings::StrCat(node.name(), "/", arg.name())); + } + + */ + return Status::OK(); + } + + Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { + + printf("Function Transformation - enabled by default\n"); + *optimized_graph = item.graph; + + /* + std::unordered_map functions; + for (const FunctionDef& func : item.graph.library().function()) { + if (func.attr().count("_noinline") == 0) { + functions[func.signature().name()] = &func; + } + } + + // Nothing to do. + if (functions.empty()) { + *optimized_graph = item.graph; + return Status::OK(); + } + + // Inline functions when possible. + for (const NodeDef& node : item.graph.node()) { + auto it = functions.find(node.op()); + if (it == functions.end()) { + *optimized_graph->add_node() = node; + } else { + TF_RETURN_IF_ERROR(InlineFunction(node, *it->second, optimized_graph)); + } + } + + // TODO(bsteiner): specialize the implementation of functions that can't be + // inlined based on the context in which they're instantiated. 
+ + // TODO(bsteiner): trim the library to remove unused function definitions + *optimized_graph->mutable_library() = item.graph.library(); + *optimized_graph->mutable_versions() = item.graph.versions(); + + */ + return Status::OK(); + } + + void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, + double result) { + // Nothing to do for FunctionTransformation. + } + + } // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h new file mode 100644 index 0000000000..514b55e0df --- /dev/null +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -0,0 +1,44 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_TRANSFORMATION_H_ +#define TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_TRANSFORMATION_H_ + +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" + +namespace tensorflow { + namespace grappler { + +// Replace function calling nodes with pairs of new 'Call/Return' operators +// operations to make the overall graph more efficient. 
+ + class FunctionTransformation : public GraphOptimizer { + public: + FunctionTransformation() {} + ~FunctionTransformation() override {} + + string name() const override { return "function_transformation"; }; + + Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; + + void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, double result) override; + }; + + } // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_TRANSFORMATION_H_ \ No newline at end of file diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 6718d2d739..39a004c601 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/layout_optimizer.h" #include "tensorflow/core/grappler/optimizers/memory_optimizer.h" #include "tensorflow/core/grappler/optimizers/model_pruner.h" +#include "tensorflow/core/grappler/optimizers/function_transformation.h" #include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/lib/core/status.h" @@ -36,6 +37,9 @@ std::unique_ptr MetaOptimizer::NewOptimizer( if (optimizer == "pruning") { graph_optimizer.reset(new ModelPruner()); } + if (optimizer == "function_transformation") { + graph_optimizer.reset(new FunctionTransformation()); + } if (optimizer == "constfold") { graph_optimizer.reset(new ConstantFolding(cpu_device_)); } @@ -62,6 +66,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, if (!cfg_.disable_model_pruning()) { optimizers.push_back(std::unique_ptr(new ModelPruner())); } + if (cfg_.function_transformation() != RewriterConfig::OFF) { + optimizers.push_back( + std::unique_ptr(new FunctionTransformation())); + } if (cfg_.constant_folding() 
!= RewriterConfig::OFF) { optimizers.push_back( std::unique_ptr(new ConstantFolding(cpu_device_))); @@ -92,6 +100,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } } else { std::set available_optimizers = {"pruning", "constfold", + "function_transformation", "layout", "memory", "autoparallel", "arithmetic"}; for (const auto& optimizer : cfg_.optimizers()) { @@ -136,7 +145,9 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item, } bool MetaOptimizerEnabled(const RewriterConfig& cfg) { - return !cfg.disable_model_pruning() || cfg.optimize_tensor_layout() || + return !cfg.disable_model_pruning() || + cfg.function_transformation() != RewriterConfig::OFF || + cfg.optimize_tensor_layout() || cfg.constant_folding() != RewriterConfig::OFF || cfg.arithmetic_optimization() != RewriterConfig::OFF || cfg.auto_parallel().enable() || cfg.memory_optimization() > 1 || diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD index bb161bf9a4..e5b916d170 100644 --- a/tensorflow/core/grappler/utils/BUILD +++ b/tensorflow/core/grappler/utils/BUILD @@ -97,3 +97,20 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +cc_library( + name = "functions", + srcs = [ + "functions.cc", + ], + hdrs = ["functions.h"], + visibility = ["//visibility:public"], + deps = [ +# "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:utils", + ], +) diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc new file mode 100644 index 0000000000..369e0003c7 --- /dev/null +++ b/tensorflow/core/grappler/utils/functions.cc @@ -0,0 +1,140 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/grappler/utils/functions.h" + +#include + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/grappler/utils.h" + +namespace tensorflow { + namespace grappler { + + std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library) { + if (func.signature().name().empty()) { + LOG(ERROR) << "function name must be specified."; + return nullptr; + } + std::unique_ptr new_item(new GrapplerItem()); + new_item->id = func.signature().name(); + + std::unordered_map port_map; + + // Add the function inputs as placeholder + for (const auto& inp : func.signature().input_arg()) { + NodeDef* ph = new_item->graph.add_node(); + ph->set_name(inp.name()); + ph->set_op("Placeholder"); + if (inp.type() != DT_INVALID) { + (*ph->mutable_attr())["T"].set_type(inp.type()); + } else { + auto it = func_attr.find(inp.type_attr()); + if (it == func_attr.end()) { + LOG(ERROR) << "Unknown type attribute " << inp.type_attr() + << " for function input " << 
inp.name(); + return nullptr; + } else { + (*ph->mutable_attr())["T"] = it->second; + } + } + port_map[inp.name()] = inp.name(); + } + + // Add the function body to the graph. + FunctionLibraryDefinition func_def(OpRegistry::Global(), library); + + for (const NodeDef& node : func.node_def()) { + NodeDef* new_node = new_item->graph.add_node(); + *new_node = node; + // Replace the placeholder attribute values with the specified value. + for (auto& attr : *new_node->mutable_attr()) { + const string& ph_name = attr.second.placeholder(); + auto it = func_attr.find(ph_name); + if (it != func_attr.end()) { + attr.second = it->second; + } + } + + // Functions use a custom format to encode connectivity. Map these custom + // strings to regular ones. + const OpRegistrationData* registration; + Status status = func_def.LookUp(node.op(), ®istration); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " not registered: " << status; + return nullptr; + } + + tensorflow::NameRangeMap inputs; + tensorflow::NameRangeMap outputs; + status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, + &outputs); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " invalid: " << status; + return nullptr; + } + for (const auto& name_range : outputs) { + string port_prefix = + strings::StrCat(node.name(), ":", name_range.first, ":"); + int index_start = name_range.second.first; + int index_end = name_range.second.second; + for (int i = index_start; i < index_end; i) { + string port_id = strings::StrCat(port_prefix, i - index_start); + string port_name = strings::StrCat(node.name(), ":", i); + port_map[port_id] = port_name; + } + } + } + + for (auto& node : *new_item->graph.mutable_node()) { + // Rewrite the inputs to use the normal naming convention. + for (int i = 0; i < node.input_size(); i) { + const string& input = node.input(i); + if (IsControlInput(input)) { + // No need to remap control dependencies. 
+ continue; + } else { + auto it = port_map.find(input); + if (it == port_map.end()) { + LOG(ERROR) << "Unknown input: " << input; + return nullptr; + } + node.set_input(i, it->second); + } + } + } + + // Add the function outputs to the list of fetch nodes. + for (const auto& out : func.signature().output_arg()) { + new_item->fetch.emplace_back(out.name()); + } + // Add the function inputs to the list of feeds. + for (const auto& inp : func.signature().input_arg()) { + new_item->feed.emplace_back(inp.name(), Tensor()); + } + + return new_item; + } + + } // end namespace grappler +} // end namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h new file mode 100644 index 0000000000..04633282a7 --- /dev/null +++ b/tensorflow/core/grappler/utils/functions.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_GRAPPLER_UTILS_FUNCTIONS_H_ +#define TENSORFLOW_GRAPPLER_UTILS_FUNCTIONS_H_ + +#include +#include +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/grappler/grappler_item.h" + +namespace tensorflow { + + namespace grappler { + +// Factory method for creating a GrapplerItem from a FunctionDef. 
+// Returns nullptr if the given function def cannot be converted. + std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library); + + } // end namespace grappler +} // end namespace tensorflow + +#endif // TENSORFLOW_GRAPPLER_UTILS_FUNCTIONS_H_ \ No newline at end of file diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto index 86ec1854fb..6d60628043 100644 --- a/tensorflow/core/protobuf/rewriter_config.proto +++ b/tensorflow/core/protobuf/rewriter_config.proto @@ -31,6 +31,8 @@ message RewriterConfig { Toggle constant_folding = 3; // Arithmetic optimizations (default is OFF) Toggle arithmetic_optimization = 7; + // Function transformation (default is ON). + Toggle function_transformation = 10; // If true, don't remove unnecessary ops from the graph bool disable_model_pruning = 2; diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index b8ab16963e..e59082114e 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -27,6 +27,7 @@ from tensorflow.core.framework import attr_value_pb2 from tensorflow.python import pywrap_tensorflow as c_api from tensorflow.python.eager import context +from tensorflow.core.framework import op_def_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import graph_to_function_def @@ -185,6 +186,61 @@ def __call__(self, func): **self._extra_kwargs) +class Declare(object): + """Declares a TensorFlow function. + + The object represents a TensorFlow function which will be defined + later during a graph construction. + + For example, + # Declares a function Foo, which takes a tf.int32 named "n" and a + # tf.float32 named "x" as inputs and returns a tf.float32 named "z" + # as its output. 
+ foo = Declare("Foo", [("n", tf.int32), ("x", tf.float32)], + [("z", tf.float32)]) + + # Defines a function Bar calls Foo. + @tf.Defun(tf.float32) + def Bar(x): + return foo(6, x) + + # Defines Foo, with output named "z". + @tf.Defun(tf.int32, tf.float32, out_names=["z"]) + def Foo(n, x): + ... # Calculation. + return result + """ + + + def __init__(self, func_name, inputs, outputs): + """Creates a `Declare` object. + + Args: + func_name: The name of the function. + inputs: A list of (name, data type) pairs of function arguments. + outputs: A list of (name, data type) pairs of function return values. + """ + self._sig = op_def_pb2.OpDef() + self._sig.name = func_name + + def _to_argdef_list(args): + names = [n for n, t in args] + if len(names) != len(set(names)): + raise ValueError("Expected names to all be unique: %s" % str(names)) + return [ + op_def_pb2.OpDef.ArgDef(type=t.as_datatype_enum, name=n) + for n, t in args + ] + + self._sig.input_arg.extend(_to_argdef_list(inputs)) + self._sig.output_arg.extend(_to_argdef_list(outputs)) + + def __call__(self, *inputs, **kwargs): + inputs = [ops.convert_to_tensor(_) for _ in inputs] + return _call(self._sig, *inputs, **kwargs)[0] + + + class _DefinedFunction(object): """_DefinedFunction encapsulates a function definition and its properties. From ff23c617f7fb80d533dd2d056ec99484b6313efc Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 14 Jun 2018 01:45:45 +0300 Subject: [PATCH 02/64] Added the inlining feature - writing the optimized graph within the core as an 'event', so that I can visualize it with tensorboard. 
--- TESTS/factorial.py | 6 +- TESTS/func.py | 5 +- TESTS/while.py | 14 + tensorflow/core/grappler/optimizers/BUILD | 3 + .../optimizers/function_transformation.cc | 540 +++++++++++++----- tensorflow/core/grappler/utils/functions.cc | 233 ++++---- tensorflow/core/grappler/utils/functions.h | 12 +- 7 files changed, 563 insertions(+), 250 deletions(-) create mode 100644 TESTS/while.py diff --git a/TESTS/factorial.py b/TESTS/factorial.py index b4487e9686..fa7adb6cbc 100644 --- a/TESTS/factorial.py +++ b/TESTS/factorial.py @@ -13,7 +13,9 @@ def FacImpl(n): FacImpl.add_to_graph(tf.get_default_graph()) n = tf.placeholder(tf.int32, shape=[]) -result = fac(n) +x = tf.add(n, 1) +result = fac(x) +y = tf.add(result, 1) writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) @@ -22,4 +24,4 @@ def FacImpl(n): writer.close() -sess.close() \ No newline at end of file +sess.close() diff --git a/TESTS/func.py b/TESTS/func.py index ab0f62f546..a0cbd065d5 100644 --- a/TESTS/func.py +++ b/TESTS/func.py @@ -16,10 +16,11 @@ def MyFunc(x, y): c, d = MyFunc(add, sub, name='mycall') -writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) +#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) with tf.Session() as sess: # no need to manually close the session print(sess.run([add, sub], feed_dict={b:1})) print(sess.run([c,d], feed_dict={b:1})) + #writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) -writer.close() +#writer.close() diff --git a/TESTS/while.py b/TESTS/while.py new file mode 100644 index 0000000000..8ca58ad6b4 --- /dev/null +++ b/TESTS/while.py @@ -0,0 +1,14 @@ +import tensorflow as tf + +n = tf.constant(4) +res = tf.while_loop(lambda i, n: i > 0, lambda i, n: (i-1, n*2), [4, 1]) + + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() +result = sess.run([res]) +print(result) + +writer.close() +sess.close() diff --git a/tensorflow/core/grappler/optimizers/BUILD 
b/tensorflow/core/grappler/optimizers/BUILD index 794cd4b867..cb3ca9d988 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -135,9 +135,12 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":graph_optimizer", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/utils:functions", ], ) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 9b7e8eacf2..9f31715a68 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -1,4 +1,3 @@ - /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,142 +15,423 @@ limitations under the License. 
#include "tensorflow/core/grappler/optimizers/function_transformation.h" #include +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" + + +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/op_types.h" +#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/grappler/utils/functions.h" namespace tensorflow { - namespace grappler { - - Status InlineFunction(const NodeDef& node, const FunctionDef& func, - GraphDef* graph) { - /* - const std::unordered_map attr(node.attr().begin(), - node.attr().end()); - FunctionDefLibrary library; - std::unique_ptr item = - GrapplerItemFromFunctionDef(func, attr, library); - - std::unordered_map input_nodes; - for (int i = 0; i < func.signature().input_arg_size(); i) { - const OpDef::ArgDef& arg = func.signature().input_arg(i); - input_nodes[arg.name()] = i; - } - - // Add an IdentityN op to hook the function inputs to: this ensures that - // they're all evaluated before the evaluation of the function body starts. 
- NodeDef* func_inputs = graph->add_node(); - func_inputs->set_name(strings::StrCat(node.name(), "/", "inlined_inputs")); - func_inputs->set_op("IdentityN"); - *func_inputs->mutable_input() = node.input(); - AttrValue::ListValue* type_list = - (*func_inputs->mutable_attr())["T"].mutable_list(); - for (const OpDef::ArgDef& arg : func.signature().input_arg()) { - auto it = attr.find(arg.type_attr()); - if (it == attr.end()) { - return errors::InvalidArgument("Invalid input argument ", arg.name(), - " for function ", node.op(), - " instantiated by ", node.name()); - } - type_list->add_type(it->second.type()); - } - - for (NodeDef& func_body_node : *item->graph.mutable_node()) { - if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { - // Turn input placeholders into identity nodes - if (IsPlaceholder(func_body_node)) { - func_body_node.set_op("Identity"); - } - CHECK_EQ(0, func_body_node.input_size()); - int input_id = input_nodes[func_body_node.name()]; - func_body_node.add_input( - strings::StrCat(func_inputs->name(), ":", input_id)); - } else { - // Update the input names. - for (string& input : *func_body_node.mutable_input()) { - input = strings::StrCat(node.name(), "/", input); - } - } - - // Add the node name as a prefix to avoid collisions after inlining - func_body_node.set_name( - strings::StrCat(node.name(), "/", func_body_node.name())); - - // Move the node to the main graph - graph->add_node()->Swap(&func_body_node); - } - - // Add an IdentityN op to hook the function outputs to: this ensures that the - // function body is fully evaluated before its fanout gets scheduled. 
- NodeDef* func_outputs = graph->add_node(); - func_outputs->set_name(node.name()); - func_outputs->set_op("IdentityN"); - type_list = (*func_outputs->mutable_attr())["T"].mutable_list(); - for (const OpDef::ArgDef& arg : func.signature().output_arg()) { - auto it = attr.find(arg.type_attr()); - if (it == attr.end()) { - return errors::InvalidArgument("Invalid output argument ", arg.name(), - " for function ", node.op(), - " instantiated by ", node.name()); - } - type_list->add_type(it->second.type()); - func_outputs->add_input(strings::StrCat(node.name(), "/", arg.name())); - } - - */ - return Status::OK(); - } - - Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - - printf("Function Transformation - enabled by default\n"); - *optimized_graph = item.graph; - - /* - std::unordered_map functions; - for (const FunctionDef& func : item.graph.library().function()) { - if (func.attr().count("_noinline") == 0) { - functions[func.signature().name()] = &func; - } - } - - // Nothing to do. - if (functions.empty()) { - *optimized_graph = item.graph; - return Status::OK(); - } - - // Inline functions when possible. - for (const NodeDef& node : item.graph.node()) { - auto it = functions.find(node.op()); - if (it == functions.end()) { - *optimized_graph->add_node() = node; - } else { - TF_RETURN_IF_ERROR(InlineFunction(node, *it->second, optimized_graph)); - } - } - - // TODO(bsteiner): specialize the implementation of functions that can't be - // inlined based on the context in which they're instantiated. 
- - // TODO(bsteiner): trim the library to remove unused function definitions - *optimized_graph->mutable_library() = item.graph.library(); - *optimized_graph->mutable_versions() = item.graph.versions(); - - */ - return Status::OK(); - } - - void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimized_graph, - double result) { - // Nothing to do for FunctionTransformation. - } - - } // end namespace grappler + namespace grappler { + namespace { + + class FunctionInliningContext { + public: + explicit FunctionInliningContext(const GrapplerItem& item) + : library_(&item.graph.library()), functions_(InliningCandidates(item)) {} + + const FunctionDefLibrary& Library() const { return *library_; } + + bool HasInlinedFunctions() const { return !functions_.empty(); } + + // Find inlining candidate by name. Return nullptr if not found. + const FunctionDef* FindInlinedFunction(const string& name) const { + auto it = functions_.find(name); + if (it != functions_.end()) { + return it->second; + } else { + return nullptr; + } + } + + private: + std::unordered_map InliningCandidates(const GrapplerItem& item) const { + + std::unordered_map functions; + + for (const FunctionDef& func : item.graph.library().function()) { + // Don't inline functions marked as noinline +// if (func.attr().count("_noinline") != 0) { +// continue; +// } + // Don't touch anything marked XLA to prevent XLA failures further down + // the road. + if (func.attr().count("_XlaCompile") > 0 && + func.attr().at("_XlaCompile").b()) { + continue; + } + // Can't create IdentityN nodes with no input or output: skip these + // functions for now. 
+ if (func.signature().input_arg_size() == 0 || + func.signature().output_arg_size() == 0) { + continue; + } + functions[func.signature().name()] = &func; + } + return functions; + } + + const FunctionDefLibrary* library_; + std::unordered_map functions_; + + TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext); + }; + + // Copy input/output argument type to the type_list. Return error if argument + // type is not explicitly defined, and not specified in function attributes. + Status CopyArgType(const NodeDef& func_node, + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + AttrValue::ListValue* type_list) { + if (arg.type() != DT_INVALID) { + type_list->add_type(arg.type()); + } else { + auto it = func_attr.find(arg.type_attr()); + if (it == func_attr.end() || it->second.type() == DT_INVALID) { + return errors::InvalidArgument( + "Invalid ", arg_kind, " argument ", arg.name(), " for function ", + func_node.op(), " instantiated by ", func_node.name()); + } + type_list->add_type(it->second.type()); + } + return Status::OK(); + } + + // Add an IdentityN op to hook the function inputs to: this ensures that + // they're all evaluated before the evaluation of the function body starts. 
+ Status HookInlinedFunctionInputs( + const NodeDef& func_node, const FunctionDef& func, + const std::unordered_map& func_attr, NodeDef* inputs) { + inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs")); + inputs->set_op("IdentityN"); + inputs->set_device(func_node.device()); + *inputs->mutable_input() = func_node.input(); + AttrValue::ListValue* type_list = + (*inputs->mutable_attr())["T"].mutable_list(); + for (const OpDef::ArgDef& arg : func.signature().input_arg()) { + TF_RETURN_IF_ERROR( + CopyArgType(func_node, func_attr, "input", arg, type_list)); + } + return Status::OK(); + } + + // Add an IdentityN op to hook the function outputs to: this ensures that the + // function body is fully evaluated before its fanout gets scheduled. + Status HookInlinedFunctionOutputs( + const NodeDef& func_node, const FunctionDef& func, + const std::unordered_map& func_attr, + const gtl::ArraySlice fetch, NodeDef* outputs) { + outputs->set_name(func_node.name()); + outputs->set_op("IdentityN"); + outputs->set_device(func_node.device()); + AttrValue::ListValue* type_list = + (*outputs->mutable_attr())["T"].mutable_list(); + for (int i = 0; i < func.signature().output_arg_size(); ++i) { + const OpDef::ArgDef& arg = func.signature().output_arg(i); + TF_RETURN_IF_ERROR( + CopyArgType(func_node, func_attr, "output", arg, type_list)); + // Use the fetch names since they take into account the output mapping. 
+ outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i])); + } + return Status::OK(); + } + + Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, + GraphDef* optimized_graph) { + + const std::unordered_map func_attr( + func_node.attr().begin(), func_node.attr().end()); + + std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); + if (!item) { + return errors::InvalidArgument("Failed to inline function ", func_node.op(), + " instantiated by ", func_node.name()); + } + + std::unordered_map input_nodes; + for (int i = 0; i < func.signature().input_arg_size(); ++i) { + const OpDef::ArgDef& arg = func.signature().input_arg(i); + input_nodes[arg.name()] = i; + } + + // Hook inlined function inputs to IdentityN node + NodeDef* func_inputs = optimized_graph->add_node(); + HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); + + for (NodeDef& func_body_node : *item->graph.mutable_node()) { + if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { + CHECK_EQ(0, func_body_node.input_size()); + // Turn input placeholders into identity nodes + if (IsPlaceholder(func_body_node)) { + func_body_node.set_op("Identity"); + } + + int input_id = input_nodes[func_body_node.name()]; + func_body_node.add_input(strings::StrCat(func_inputs->name(), ":", input_id)); + } + + else { + // Update the input names if any. + for (string& input : *func_body_node.mutable_input()) { + input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); + } + // If the node has no input, make hook it up to the func_inputs node to + // ensure it runs in the same frame as the other nodes of the function + // body. 
+ if (func_body_node.input_size() == 0) { + *func_body_node.add_input() = AsControlDependency(func_inputs->name()); + } + } + + // Add the node name as a prefix to avoid collisions after inlining + func_body_node.set_name(strings::StrCat(func_node.name(), "/", func_body_node.name())); + + // Make sure the node is placed + func_body_node.set_device(func_node.device()); + + // Check if a body node is itself a function +// const FunctionDef* func_body_node_func = ctx.FindInlinedFunction(func_body_node.op()); + +// if (func_body_node_func != nullptr) { +// // Recursively inline function calls +// InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph); +// } +// +// else { + // Move the node to the main graph + optimized_graph->add_node()->Swap(&func_body_node); +// } + } + + // Hook inlined function outputs to IdentityN node + NodeDef* func_outputs = optimized_graph->add_node(); + HookInlinedFunctionOutputs(func_node, func, func_attr, item->fetch, func_outputs); + + return Status::OK(); + } +/* + class FakeCPUDevice : public Device { + public: + FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {} + Status Sync() override { return Status::OK(); } + }; + + class SymbolicGradientEnv { + public: + SymbolicGradientEnv(int graph_version, const FunctionDefLibrary& library) + : graph_version_(graph_version), library_(library) {} + + FunctionLibraryDefinition* function_library() { + InitializeIfNeeded(); + return fld_.get(); + } + FunctionLibraryRuntime* function_library_runtime() { + InitializeIfNeeded(); + return flr_; + } + + private: + // This initialization is expensive. Do it lazily to avoid paying for it + // unless it's needed. 
+ void InitializeIfNeeded() { + if (flr_) { + return; + } + Env* env = Env::Default(); + DeviceAttributes attr; + attr.set_name("/device:CPU:0"); + attr.set_device_type("CPU"); + FakeCPUDevice* dev = new FakeCPUDevice(env, attr); + std::vector devices; + devices.push_back(dev); + dvc_mgr_.reset(new DeviceMgr(devices)); + fld_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), library_)); + OptimizerOptions optimizer_opts; + optimizer_opts.set_do_function_inlining(true); + pflr_.reset(new ProcessFunctionLibraryRuntime( + dvc_mgr_.get(), env, graph_version_, fld_.get(), optimizer_opts)); + flr_ = pflr_->GetFLR(dev->name()); + } + + const int graph_version_; + const FunctionDefLibrary& library_; + std::unique_ptr dvc_mgr_; + std::unique_ptr fld_; + std::unique_ptr pflr_; + FunctionLibraryRuntime* flr_ = nullptr; + }; + + Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env, + GraphDef* inlined_graph) + { + GraphDef graph_def; + + // Create a node to anchor the gradient inputs + NodeDef* inlined_input = graph_def.add_node(); + inlined_input->set_name("FunctionInputs"); + inlined_input->set_op("IdentityN"); + AttrValue::ListValue* type_list = + (*inlined_input->mutable_attr())["T"].mutable_list(); + for (const auto& type : node.attr().at("Tin").list().type()) { + type_list->add_type(static_cast(type)); + } + + // Add the gradient node + NodeDef* inlined = graph_def.add_node(); + *inlined = node; + inlined->clear_input(); + for (int i = 0; i < node.attr().at("Tin").list().type_size(); ++i) { + inlined->add_input(strings::StrCat(inlined_input->name(), ":", i)); + } + + // Create a node to anchor the gradient outputs + NodeDef* inlined_output = graph_def.add_node(); + inlined_output->set_name("FunctionOutputs"); + inlined_output->set_op("IdentityN"); + type_list = (*inlined_output->mutable_attr())["T"].mutable_list(); + for (const auto& type : node.attr().at("Tout").list().type()) { + type_list->add_type(static_cast(type)); + } + for (int i = 
0; i < node.attr().at("Tout").list().type_size(); ++i) { + inlined_output->add_input(strings::StrCat(inlined->name(), ":", i)); + } + + // Convert the graphdef to a graph + GraphConstructorOptions graph_ctor_opts; + graph_ctor_opts.allow_internal_ops = true; + graph_ctor_opts.expect_device_spec = false; + Graph graph(env->function_library()); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph)); + + // Recursively inline the functions until there is nothing more to inline. We + // should at least expand one function. + int counter = 0; + while (counter < 50 && + ExpandInlineFunctions(env->function_library_runtime(), &graph)) { + ++counter; + } + + GraphDef inlined_graph_def; + graph.ToGraphDef(&inlined_graph_def); + + // Add the default values of attributes to the nodes that have been inlined. + TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def, *graph.op_registry(), 0, true)); + + // Add the inlined nodes to the graph + for (NodeDef& inlined_node : *inlined_graph_def.mutable_node()) { + if (inlined_node.name() == "FunctionOutputs") { + inlined_node.set_name(node.name()); + for (int i = 0; i < inlined_node.input_size(); ++i) { + inlined_node.set_input( + i, AddPrefixToNodeName(inlined_node.input(i), node.name())); + } + } else if (inlined_node.name() == "FunctionInputs") { + inlined_node.set_name( + AddPrefixToNodeName(inlined_node.name(), node.name())); + inlined_node.clear_input(); + for (int i = 0; i < node.input_size(); ++i) { + inlined_node.add_input(node.input(i)); + } + } else { + inlined_node.set_name( + AddPrefixToNodeName(inlined_node.name(), node.name())); + for (int i = 0; i < inlined_node.input_size(); ++i) { + inlined_node.set_input( + i, AddPrefixToNodeName(inlined_node.input(i), node.name())); + } + // If the node has no input, hook it up to the function input node to make + // sure it runs in the same frame as the other nodes of the function body. 
+ if (inlined_node.input_size() == 0) { + *inlined_node.add_input() = AsControlDependency( + AddPrefixToNodeName("FunctionInputs", node.name())); + } + } + inlined_node.set_device(node.device()); + inlined_graph->add_node()->Swap(&inlined_node); + } + + return Status::OK(); + } +*/ + } // namespace + + Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { + + printf("Function Transformation: Enabled By Default\n"); + + FunctionInliningContext function_inlining_ctx(item); + + // Nothing to do here. + if (!function_inlining_ctx.HasInlinedFunctions()) { + *optimized_graph = item.graph; + return Status::OK(); + } + +// SymbolicGradientEnv env(item.graph.versions().producer(),item.graph.library()); + + for (const NodeDef& node : item.graph.node()) { +// if (node.op() == "SymbolicGradient") { +// TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph)); +// continue; +// } + + const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); + if (func != nullptr) { + InlineFunction(node, *func, function_inlining_ctx, optimized_graph); + } + + else { + *optimized_graph->add_node() = node; + } + } + + *optimized_graph->mutable_versions() = item.graph.versions(); + *optimized_graph->mutable_library() = item.graph.library(); + + + /******************************************************************************************************/ + // Dumps optimized graph in a not so readable form + const GraphDef* tmp = optimized_graph; + printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); + + // Write an event, so that we can visualize this optimized graph in tensorboard + EventsWriter writer("INLINE"); + Event event; + event.set_wall_time(1234); + event.set_step(34); + + const size_t proto_size = optimized_graph->ByteSizeLong(); + void* buf = port::Malloc(proto_size); + if (buf == nullptr) { + return tensorflow::errors::ResourceExhausted("Failed to allocate memory to 
serialize message of type '" + ,optimized_graph->GetTypeName(), "' and size ", proto_size); + } + optimized_graph->SerializeToArray(buf, proto_size); + const void* bf = buf; + event.set_graph_def(bf, proto_size); + writer.WriteEvent(event); + /******************************************************************************************************/ + + return Status::OK(); + } + + void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, + double result) { + // Nothing to do for FunctionOptimizer. + } + + } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 369e0003c7..8333ddf134 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -26,115 +26,128 @@ limitations under the License. #include "tensorflow/core/grappler/utils.h" namespace tensorflow { - namespace grappler { - - std::unique_ptr GrapplerItemFromFunctionDef( - const FunctionDef& func, - const std::unordered_map& func_attr, - const FunctionDefLibrary& library) { - if (func.signature().name().empty()) { - LOG(ERROR) << "function name must be specified."; - return nullptr; - } - std::unique_ptr new_item(new GrapplerItem()); - new_item->id = func.signature().name(); - - std::unordered_map port_map; - - // Add the function inputs as placeholder - for (const auto& inp : func.signature().input_arg()) { - NodeDef* ph = new_item->graph.add_node(); - ph->set_name(inp.name()); - ph->set_op("Placeholder"); - if (inp.type() != DT_INVALID) { - (*ph->mutable_attr())["T"].set_type(inp.type()); - } else { - auto it = func_attr.find(inp.type_attr()); - if (it == func_attr.end()) { - LOG(ERROR) << "Unknown type attribute " << inp.type_attr() - << " for function input " << inp.name(); - return nullptr; - } else { - (*ph->mutable_attr())["T"] = it->second; - } - } - port_map[inp.name()] = inp.name(); - } 
- - // Add the function body to the graph. - FunctionLibraryDefinition func_def(OpRegistry::Global(), library); - - for (const NodeDef& node : func.node_def()) { - NodeDef* new_node = new_item->graph.add_node(); - *new_node = node; - // Replace the placeholder attribute values with the specified value. - for (auto& attr : *new_node->mutable_attr()) { - const string& ph_name = attr.second.placeholder(); - auto it = func_attr.find(ph_name); - if (it != func_attr.end()) { - attr.second = it->second; - } - } - - // Functions use a custom format to encode connectivity. Map these custom - // strings to regular ones. - const OpRegistrationData* registration; - Status status = func_def.LookUp(node.op(), ®istration); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " not registered: " << status; - return nullptr; - } - - tensorflow::NameRangeMap inputs; - tensorflow::NameRangeMap outputs; - status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, - &outputs); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " invalid: " << status; - return nullptr; - } - for (const auto& name_range : outputs) { - string port_prefix = - strings::StrCat(node.name(), ":", name_range.first, ":"); - int index_start = name_range.second.first; - int index_end = name_range.second.second; - for (int i = index_start; i < index_end; i) { - string port_id = strings::StrCat(port_prefix, i - index_start); - string port_name = strings::StrCat(node.name(), ":", i); - port_map[port_id] = port_name; - } - } - } - - for (auto& node : *new_item->graph.mutable_node()) { - // Rewrite the inputs to use the normal naming convention. - for (int i = 0; i < node.input_size(); i) { - const string& input = node.input(i); - if (IsControlInput(input)) { - // No need to remap control dependencies. 
- continue; - } else { - auto it = port_map.find(input); - if (it == port_map.end()) { - LOG(ERROR) << "Unknown input: " << input; - return nullptr; + namespace grappler { + + std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library) { + if (func.signature().name().empty()) { + LOG(ERROR) << "function name must be specified."; + return nullptr; + } + std::unique_ptr new_item(new GrapplerItem()); + new_item->id = func.signature().name(); + + std::unordered_map port_map; + + // Add the function inputs as placeholder + for (const auto& inp : func.signature().input_arg()) { + NodeDef* ph = new_item->graph.add_node(); + ph->set_name(inp.name()); + ph->set_op("Placeholder"); + if (inp.type() != DT_INVALID) { + (*ph->mutable_attr())["T"].set_type(inp.type()); + } else { + auto it = func_attr.find(inp.type_attr()); + if (it == func_attr.end()) { + LOG(ERROR) << "Unknown type attribute " << inp.type_attr() + << " for function input " << inp.name(); + return nullptr; + } else { + (*ph->mutable_attr())["T"] = it->second; + } + } + port_map[inp.name()] = inp.name(); + } + + // Add the function body to the graph. + FunctionLibraryDefinition func_def(OpRegistry::Global(), library); + + for (const NodeDef& node : func.node_def()) { + NodeDef* new_node = new_item->graph.add_node(); + *new_node = node; + // Replace the placeholder attribute values with the specified value. + for (auto& attr : *new_node->mutable_attr()) { + const string& ph_name = attr.second.placeholder(); + auto it = func_attr.find(ph_name); + if (it != func_attr.end()) { + attr.second = it->second; + } + } + + // Functions use a custom format to encode connectivity. Map these custom + // strings to regular ones. 
+ const OpRegistrationData* registration; + Status status = func_def.LookUp(node.op(), ®istration); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " not registered: " << status; + return nullptr; + } + + tensorflow::NameRangeMap inputs; + tensorflow::NameRangeMap outputs; + status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, + &outputs); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " invalid: " << status; + return nullptr; + } + for (const auto& name_range : outputs) { + string port_prefix = + strings::StrCat(node.name(), ":", name_range.first, ":"); + int index_start = name_range.second.first; + int index_end = name_range.second.second; + for (int i = index_start; i < index_end; ++i) { + string port_id = strings::StrCat(port_prefix, i - index_start); + string port_name = strings::StrCat(node.name(), ":", i); + port_map[port_id] = port_name; + } + } + } + + for (auto& node : *new_item->graph.mutable_node()) { + // Rewrite the inputs to use the normal naming convention. + for (int i = 0; i < node.input_size(); ++i) { + const string& input = node.input(i); + if (IsControlInput(input)) { + // No need to remap control dependencies. + continue; + } else { + auto it = port_map.find(input); + if (it == port_map.end()) { + LOG(ERROR) << "Unknown input: " << input; + return nullptr; + } + node.set_input(i, it->second); + } + } + } + + // Add the function outputs to the list of fetch nodes, taking into account + // the output mapping if any. + for (const auto& out : func.signature().output_arg()) { + auto it = func.ret().find(out.name()); + if (it != func.ret().end()) { + auto it2 = port_map.find(it->second); + if (it2 == port_map.end()) { + LOG(ERROR) << "Unknown output mapping: " << it->first << " to " + << it->second; + return nullptr; + } else { + new_item->fetch.emplace_back(it2->second); + } + } else { + new_item->fetch.emplace_back(out.name()); + } + } + // Add the function inputs to the list of feeds. 
+ for (const auto& inp : func.signature().input_arg()) { + new_item->feed.emplace_back(inp.name(), Tensor()); + } + + return new_item; } - node.set_input(i, it->second); - } - } - } - - // Add the function outputs to the list of fetch nodes. - for (const auto& out : func.signature().output_arg()) { - new_item->fetch.emplace_back(out.name()); - } - // Add the function inputs to the list of feeds. - for (const auto& inp : func.signature().input_arg()) { - new_item->feed.emplace_back(inp.name(), Tensor()); - } - - return new_item; - } - - } // end namespace grappler + + } // end namespace grappler } // end namespace tensorflow \ No newline at end of file diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h index 04633282a7..6d0eed3fa6 100644 --- a/tensorflow/core/grappler/utils/functions.h +++ b/tensorflow/core/grappler/utils/functions.h @@ -24,16 +24,16 @@ limitations under the License. namespace tensorflow { - namespace grappler { + namespace grappler { // Factory method for creating a GrapplerItem from a FunctionDef. // Returns nullptr if the given function def cannot be converted. - std::unique_ptr GrapplerItemFromFunctionDef( - const FunctionDef& func, - const std::unordered_map& func_attr, - const FunctionDefLibrary& library); + std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library); - } // end namespace grappler + } // end namespace grappler } // end namespace tensorflow #endif // TENSORFLOW_GRAPPLER_UTILS_FUNCTIONS_H_ \ No newline at end of file From 3d60a2ff1d323cfd496754d17d07c7dc7878cc1e Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Sat, 16 Jun 2018 23:27:27 +0300 Subject: [PATCH 03/64] Transformation_v.1 - Merge / non existing op (IdentityN for now) / has as many inputs as the call sites - Enter/ Exit ops are simple IdentityN nodes for now. 
--- TESTS/callg.py | 33 +++++ TESTS/factorial.py | 6 +- TESTS/func.py | 14 +- .../optimizers/function_transformation.cc | 131 ++++++++++++++---- .../optimizers/function_transformation.h | 9 ++ 5 files changed, 155 insertions(+), 38 deletions(-) create mode 100644 TESTS/callg.py diff --git a/TESTS/callg.py b/TESTS/callg.py new file mode 100644 index 0000000000..9d74584809 --- /dev/null +++ b/TESTS/callg.py @@ -0,0 +1,33 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +@function.Defun(tf.float32) +def G(x): + return [x + x] + + +@function.Defun(tf.float32, tf.float32) +def MyFunc(x, y): + return [G(x), G(y)] + + +# Building the graph. + +a = tf.constant([4.0], name="a") +b = tf.placeholder(tf.float32, name="MyPlaceHolder") + +add = tf.add(a, b, name="add") +sub = tf.subtract(a, b, name="sub") + +[c,d] = MyFunc(add, sub, name='mycall') + +x = tf.add(c, d) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session() as sess: # no need to manually close the session +# print(sess.run([add, sub], feed_dict={b:1})) + print(sess.run([x], feed_dict={b:1})) + #writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +writer.close() diff --git a/TESTS/factorial.py b/TESTS/factorial.py index fa7adb6cbc..e2a3dda6ae 100644 --- a/TESTS/factorial.py +++ b/TESTS/factorial.py @@ -17,11 +17,11 @@ def FacImpl(n): result = fac(x) y = tf.add(result, 1) -writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) +#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) sess = tf.Session() -print(sess.run(result, feed_dict={n: 3})) +print(sess.run(y, feed_dict={n: 3})) -writer.close() +#writer.close() sess.close() diff --git a/TESTS/func.py b/TESTS/func.py index a0cbd065d5..6ceec896ae 100644 --- a/TESTS/func.py +++ b/TESTS/func.py @@ -3,7 +3,7 @@ @function.Defun(tf.float32, tf.float32) def MyFunc(x, y): - return x + y, x - y + return [x + y, x - y] # Building the graph. 
@@ -14,13 +14,15 @@ def MyFunc(x, y): add = tf.add(a, b, name="add") sub = tf.subtract(a, b, name="sub") -c, d = MyFunc(add, sub, name='mycall') +[c,d] = MyFunc(add, sub, name='mycall') -#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) +x = tf.add(c, d) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) with tf.Session() as sess: # no need to manually close the session - print(sess.run([add, sub], feed_dict={b:1})) - print(sess.run([c,d], feed_dict={b:1})) +# print(sess.run([add, sub], feed_dict={b:1})) + print(sess.run([x], feed_dict={b:1})) #writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) -#writer.close() +writer.close() diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 9f31715a68..7d3fa343fe 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -93,6 +93,7 @@ namespace tensorflow { const std::unordered_map& func_attr, const string& arg_kind, const OpDef::ArgDef& arg, AttrValue::ListValue* type_list) { + if (arg.type() != DT_INVALID) { type_list->add_type(arg.type()); } else { @@ -112,15 +113,14 @@ namespace tensorflow { Status HookInlinedFunctionInputs( const NodeDef& func_node, const FunctionDef& func, const std::unordered_map& func_attr, NodeDef* inputs) { - inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs")); + + inputs->set_name(strings::StrCat(func_node.name(), "/", "Enter")); inputs->set_op("IdentityN"); inputs->set_device(func_node.device()); - *inputs->mutable_input() = func_node.input(); - AttrValue::ListValue* type_list = - (*inputs->mutable_attr())["T"].mutable_list(); + *inputs->mutable_input() = func_node.input(); //IdentityN node steals the inputs from Func node + AttrValue::ListValue* type_list = (*inputs->mutable_attr())["T"].mutable_list(); for (const OpDef::ArgDef& arg : 
func.signature().input_arg()) { - TF_RETURN_IF_ERROR( - CopyArgType(func_node, func_attr, "input", arg, type_list)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, type_list)); } return Status::OK(); } @@ -131,37 +131,84 @@ namespace tensorflow { const NodeDef& func_node, const FunctionDef& func, const std::unordered_map& func_attr, const gtl::ArraySlice fetch, NodeDef* outputs) { - outputs->set_name(func_node.name()); + + // Exit Node + outputs->set_name(func_node.name());//strings::StrCat(func_node.name(), "/", "Exit")); outputs->set_op("IdentityN"); outputs->set_device(func_node.device()); - AttrValue::ListValue* type_list = - (*outputs->mutable_attr())["T"].mutable_list(); + AttrValue::ListValue* type_list = (*outputs->mutable_attr())["T"].mutable_list(); for (int i = 0; i < func.signature().output_arg_size(); ++i) { const OpDef::ArgDef& arg = func.signature().output_arg(i); - TF_RETURN_IF_ERROR( - CopyArgType(func_node, func_attr, "output", arg, type_list)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, type_list)); // Use the fetch names since they take into account the output mapping. 
outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i])); } return Status::OK(); } + + Status CreateCycle(NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, + GraphDef* optimized_graph, std::unordered_map functions_in) { + + printf("Recursion Detected\n"); + + const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); + + NodeDef* merge; + ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + + // Hook inlined function inputs to IdentityN node + NodeDef* func_inputs = optimized_graph->add_node(); + HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); + + // Hook IdentityN node's outputs to func's Merges nodes + for (int i = 0; i < func.signature().input_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().input_arg(i); + + merge = argmerge_map[arg.name()]; + merge->add_input(strings::StrCat(func_inputs->name(), ":", i)); + } + + // Hook inlined function outputs to IdentityN node + string name = func_node.name(); + func_node.set_name(func_node.op()); + NodeDef* func_outputs = optimized_graph->add_node(); + HookInlinedFunctionOutputs(func_node, func, func_attr, functions_in[func_node.op()].fetch, func_outputs); + // Re-set node's name - I wanted to avoid changing HookInlinedFunctionOutputs + func_outputs->set_name(name); + + return Status::OK(); + } + + Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, - GraphDef* optimized_graph) { + GraphDef* optimized_graph, std::unordered_map functions_in) { - const std::unordered_map func_attr( - func_node.attr().begin(), func_node.attr().end()); + const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); if (!item) { - return errors::InvalidArgument("Failed to inline function ", func_node.op(), - " instantiated by ", func_node.name()); + return 
errors::InvalidArgument("Failed to inline function ", func_node.op(), " instantiated by ", func_node.name()); } + functions_in[func_node.op()].fetch = item->fetch; + ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + NodeDef* merge; std::unordered_map input_nodes; for (int i = 0; i < func.signature().input_arg_size(); ++i) { const OpDef::ArgDef& arg = func.signature().input_arg(i); input_nodes[arg.name()] = i; + + // Create a merge node for every input arg + merge = optimized_graph->add_node(); + // Initialize merge + merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", arg.name())); + merge->set_op("IdentityN"); + merge->set_device(func_node.device()); + + // Merge's Attrs will be initialized later + argmerge_map.emplace(arg.name(), merge); + } // Hook inlined function inputs to IdentityN node @@ -169,17 +216,22 @@ namespace tensorflow { HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); for (NodeDef& func_body_node : *item->graph.mutable_node()) { + + // If the func body node is func's input argument, these nodes will now take input from IdentityN if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { CHECK_EQ(0, func_body_node.input_size()); // Turn input placeholders into identity nodes if (IsPlaceholder(func_body_node)) { func_body_node.set_op("Identity"); } - + // Connect merge with input arg + func_body_node.add_input(argmerge_map[func_body_node.name()]->name()); + // Connect IdentityN to merge int input_id = input_nodes[func_body_node.name()]; - func_body_node.add_input(strings::StrCat(func_inputs->name(), ":", input_id)); + argmerge_map[func_body_node.name()]->add_input(strings::StrCat(func_inputs->name(), ":", input_id)); } + // Else if not an input_arg_node else { // Update the input names if any. 
for (string& input : *func_body_node.mutable_input()) { @@ -200,17 +252,32 @@ namespace tensorflow { func_body_node.set_device(func_node.device()); // Check if a body node is itself a function -// const FunctionDef* func_body_node_func = ctx.FindInlinedFunction(func_body_node.op()); - -// if (func_body_node_func != nullptr) { -// // Recursively inline function calls -// InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph); -// } -// -// else { + const FunctionDef* func_body_node_func = ctx.FindInlinedFunction(func_body_node.op()); + + // Node is yet another function + if (func_body_node_func != nullptr) { + + // Check if that function has already been inlined + auto it = functions_in.find(func_body_node.op()); + + // Not already in => Inline it + if (it == functions_in.end()) { + FuncInfo func_info; + functions_in.emplace(func_body_node.op(), func_info); + InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); + functions_in.erase(func_body_node.op()); + } + // Already in -> Insert Enter/Exit ops end feed back / create cycle + // (recursion or mutually recursive functions) + else { + CreateCycle(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); + } + } + + else { // Move the node to the main graph optimized_graph->add_node()->Swap(&func_body_node); -// } + } } // Hook inlined function outputs to IdentityN node @@ -366,6 +433,7 @@ namespace tensorflow { */ } // namespace + Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { @@ -381,6 +449,9 @@ namespace tensorflow { // SymbolicGradientEnv env(item.graph.versions().producer(),item.graph.library()); +// std::unordered_map argmerge_map; + std::unordered_map functions_in; + for (const NodeDef& node : item.graph.node()) { // if (node.op() == "SymbolicGradient") { // TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph)); @@ -389,9 +460,11 @@ namespace tensorflow { 
const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); if (func != nullptr) { - InlineFunction(node, *func, function_inlining_ctx, optimized_graph); + FuncInfo func_info; + functions_in.emplace(node.op(), func_info); + InlineFunction(node, *func, function_inlining_ctx, optimized_graph, functions_in); + functions_in.erase(node.op()); // At this point functions_in will be empty } - else { *optimized_graph->add_node() = node; } diff --git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h index 514b55e0df..1caf2dd761 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.h +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -17,10 +17,19 @@ limitations under the License. #define TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_TRANSFORMATION_H_ #include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/grappler/grappler_item.h" namespace tensorflow { namespace grappler { + + typedef std::unordered_map ArgMergeMap; + + typedef struct { + ArgMergeMap argMergeMap; + gtl::ArraySlice fetch; + } FuncInfo; + // Replace function calling nodes with pairs of new 'Call/Return' operators // operations to make the overall graph more efficient. From 26d33f99886da4d0f6130ca87e14d9d590767b20 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Sun, 17 Jun 2018 04:42:56 +0300 Subject: [PATCH 04/64] Transformation_v.2 - Multiple Merge ops - Enter/ Exit ops are simple IdentityN nodes for now. 
--- TESTS/fib.py | 10 +- .../optimizers/function_transformation.cc | 116 ++++++++++++++---- .../optimizers/function_transformation.h | 1 - 3 files changed, 94 insertions(+), 33 deletions(-) diff --git a/TESTS/fib.py b/TESTS/fib.py index f4adf29944..5921a1c1b8 100644 --- a/TESTS/fib.py +++ b/TESTS/fib.py @@ -7,21 +7,21 @@ def FibImpl(n): return tf.cond(tf.less_equal(n, 1), lambda: tf.constant(1), - lambda: fib(n-1)) + lambda: fib(n-1) + fib(n-2)) FibImpl.add_to_graph(tf.get_default_graph()) n = tf.placeholder(tf.int32, shape=[]) res = fib(n) -writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) +#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) sess = tf.Session() -print(tf.get_default_graph().as_graph_def()) +#print(tf.get_default_graph().as_graph_def()) -writer.close() -#print(sess.run(res, feed_dict={n: 0})) +#writer.close() +print(sess.run(res, feed_dict={n: 4})) sess.close() diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 7d3fa343fe..665a72264f 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -147,37 +147,37 @@ namespace tensorflow { } - Status CreateCycle(NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, - GraphDef* optimized_graph, std::unordered_map functions_in) { + Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, + std::unordered_map functions_in) { - printf("Recursion Detected\n"); + printf("Recursion Detected\n"); - const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); + const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); - NodeDef* merge; - ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + NodeDef* merge; + ArgMergeMap& argmerge_map = 
functions_in[func_node.op()].argMergeMap; - // Hook inlined function inputs to IdentityN node - NodeDef* func_inputs = optimized_graph->add_node(); - HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); + // Hook inlined function inputs to Enter node + NodeDef* func_inputs = optimized_graph->add_node(); + HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); - // Hook IdentityN node's outputs to func's Merges nodes - for (int i = 0; i < func.signature().input_arg_size(); ++i) { - const OpDef::ArgDef &arg = func.signature().input_arg(i); + // Hook Enter node's outputs to func's Merges nodes + for (int i = 0; i < func.signature().input_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().input_arg(i); - merge = argmerge_map[arg.name()]; - merge->add_input(strings::StrCat(func_inputs->name(), ":", i)); - } + merge = argmerge_map[arg.name()]; + merge->add_input(strings::StrCat(func_inputs->name(), ":", i)); + } - // Hook inlined function outputs to IdentityN node - string name = func_node.name(); - func_node.set_name(func_node.op()); - NodeDef* func_outputs = optimized_graph->add_node(); - HookInlinedFunctionOutputs(func_node, func, func_attr, functions_in[func_node.op()].fetch, func_outputs); - // Re-set node's name - I wanted to avoid changing HookInlinedFunctionOutputs - func_outputs->set_name(name); + // Hook inlined function outputs to Exit node + string name = func_node.name(); + func_node.set_name(func_node.op()); + NodeDef* func_outputs = optimized_graph->add_node(); + HookInlinedFunctionOutputs(func_node, func, func_attr, functions_in[func_node.op()].fetch, func_outputs); + // Re-set node's name - I wanted to avoid changing HookInlinedFunctionOutputs + func_outputs->set_name(name); - return Status::OK(); + return Status::OK(); } @@ -211,7 +211,7 @@ namespace tensorflow { } - // Hook inlined function inputs to IdentityN node + // Hook inlined function inputs to Enter node NodeDef* func_inputs = 
optimized_graph->add_node(); HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); @@ -267,10 +267,10 @@ namespace tensorflow { InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); functions_in.erase(func_body_node.op()); } - // Already in -> Insert Enter/Exit ops end feed back / create cycle + // Already in -> Insert Enter/Exit ops end create cycle // (recursion or mutually recursive functions) else { - CreateCycle(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); + CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in); } } @@ -280,10 +280,72 @@ namespace tensorflow { } } - // Hook inlined function outputs to IdentityN node + // Hook inlined function outputs to Exit node NodeDef* func_outputs = optimized_graph->add_node(); HookInlinedFunctionOutputs(func_node, func, func_attr, item->fetch, func_outputs); + // Break Merges into multiple common Merge ops and fix their attrs + int j=0; + + for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { + + NodeDef *new_merge; + merge = it->second; + + DataType type; + const OpDef::ArgDef& arg = func.signature().input_arg(j); + + if (arg.type() != DT_INVALID) { + type = arg.type(); + } else { + auto it = func_attr.find(arg.type_attr()); + if (it == func_attr.end() || it->second.type() == DT_INVALID) { + return errors::InvalidArgument( + "Invalid argument ", arg.name(), " for function ", + func_node.op(), " instantiated by ", func_node.name()); + } + type = it->second.type(); + } + + int i, size = merge->input_size(); + + // If there is only one call site, leave Merge as it is (IdentityN node) + // and it will be eliminated by other optimizers + if (size < 2) + break; + + string name = merge->name(); + string in1 = merge->input(0), in2; + + for (i=1; i < size-1; i++) { + + in2 = merge->input(i); + + new_merge = optimized_graph->add_node(); + + // Initialize new node + name = strings::StrCat(name, 
merge->input_size()-i-1); + new_merge->set_name(name); + new_merge->set_op("Merge"); + new_merge->set_device(func_node.device()); + new_merge->add_input(in1); + new_merge->add_input(in2); + (*new_merge->mutable_attr())["T"].set_type(type); + + in1 = name; + } + + // Modify initial Merge + in2 = merge->input(i); + merge->set_op("Merge"); + merge->set_device(func_node.device()); + merge->clear_input(); + merge->add_input(in1); + merge->add_input(in2); + (*merge->mutable_attr())["T"].set_type(type); + + } + return Status::OK(); } /* diff --git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h index 1caf2dd761..9c13372572 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.h +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -31,7 +31,6 @@ namespace tensorflow { } FuncInfo; // Replace function calling nodes with pairs of new 'Call/Return' operators -// operations to make the overall graph more efficient. class FunctionTransformation : public GraphOptimizer { public: From 843c3cb960bc65c40b7bc04f683c07665b83b3ef Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Sun, 17 Jun 2018 20:08:45 +0300 Subject: [PATCH 05/64] Preparing framework/core for new ops Call/Return/NextCall - I'm gonna break Call/Return (IdentityN-like) nodes to multiple Call/Return (Identity-like) nodes similar to iteration's ops, so that I can steal code more easily and allow more parallelism. - I'm gonna add a 'NextCall' op as well, for the sake of consistency. 
--- TESTS/fib.py | 6 +- .../common_runtime/graph_execution_state.cc | 25 +++ tensorflow/core/graph/graph.cc | 3 + tensorflow/core/graph/graph.h | 9 + tensorflow/core/graph/graph_constructor.cc | 68 +++++++- tensorflow/core/grappler/op_types.cc | 15 ++ tensorflow/core/grappler/op_types.h | 3 + .../optimizers/function_transformation.cc | 4 +- .../grappler/optimizers/meta_optimizer.cc | 4 +- tensorflow/core/ops/control_flow_ops.cc | 164 ++++++++++++++++-- 10 files changed, 270 insertions(+), 31 deletions(-) diff --git a/TESTS/fib.py b/TESTS/fib.py index 5921a1c1b8..8e5c330070 100644 --- a/TESTS/fib.py +++ b/TESTS/fib.py @@ -12,7 +12,9 @@ def FibImpl(n): FibImpl.add_to_graph(tf.get_default_graph()) n = tf.placeholder(tf.int32, shape=[]) -res = fib(n) +x = fib(n) + +res = tf.add(x, 1) #writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) @@ -22,6 +24,6 @@ def FibImpl(n): #writer.close() -print(sess.run(res, feed_dict={n: 4})) +print(sess.run(res, feed_dict={n: 5})) sess.close() diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 4bd40c7978..dcc47c07fe 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -20,6 +20,8 @@ limitations under the License. 
#include #include +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/common_runtime/placer.h" @@ -356,11 +358,34 @@ Status GraphExecutionState::OptimizeGraph( GraphDef new_graph; TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer( item, rewrite_options, cpu_device, &cluster, &new_graph)); + GraphConstructorOptions opts; opts.allow_internal_ops = true; optimized_graph->reset(new Graph(OpRegistry::Global())); TF_RETURN_IF_ERROR( ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); +/*******************************************************************************************/ + // Write an event, so that we can visualize this optimized graph in tensorboard + EventsWriter writer("Fully_Optimized"); + Event event; + event.set_wall_time(1234); + event.set_step(34); + + const size_t proto_size = new_graph.ByteSizeLong(); + void* buf = port::Malloc(proto_size); + if (buf == nullptr) { + return tensorflow::errors::ResourceExhausted("Failed to allocate memory to serialize message of type '" + ,new_graph.GetTypeName(), "' and size ", proto_size); + } + new_graph.SerializeToArray(buf, proto_size); + const void* bf = buf; + event.set_graph_def(bf, proto_size); + writer.WriteEvent(event); + + printf(" Test\n"); +/*******************************************************************************************/ + + // The graph conversion sets the requested device names but not the assigned // device names. However, since at this point the graph is placed TF expects // an assigned device name for every node. 
Therefore we copy the requested diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 45ab38c395..948d364ddd 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -61,6 +61,9 @@ const std::unordered_map& Node::kNodeClassTable = REF_CLASS("Enter", NC_ENTER), REF_CLASS("Exit", NC_EXIT), REF_CLASS("NextIteration", NC_NEXT_ITERATION), + REF_CLASS("Call", NC_CALL), + REF_CLASS("Return", NC_RETURN), + REF_CLASS("NextCall", NC_NEXT_CALL), {"LoopCond", NC_LOOP_COND}, {"ControlTrigger", NC_CONTROL_TRIGGER}, {"_Send", NC_SEND}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 72c8d38cb9..e7b81b0531 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -142,6 +142,9 @@ class Node { bool IsEnter() const { return class_ == NC_ENTER; } bool IsExit() const { return class_ == NC_EXIT; } bool IsNextIteration() const { return class_ == NC_NEXT_ITERATION; } + bool IsCall() const { return class_ == NC_ENTER; } + bool IsReturn() const { return class_ == NC_EXIT; } + bool IsNextCall() const { return class_ == NC_NEXT_ITERATION; } bool IsLoopCond() const { return class_ == NC_LOOP_COND; } bool IsControlTrigger() const { return class_ == NC_CONTROL_TRIGGER; } bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND; } @@ -219,6 +222,9 @@ class Node { NC_ENTER, NC_EXIT, NC_NEXT_ITERATION, + NC_CALL, + NC_RETURN, + NC_NEXT_CALL, NC_LOOP_COND, NC_CONTROL_TRIGGER, NC_SEND, @@ -646,6 +652,9 @@ inline bool IsMerge(const Node* node) { return node->IsMerge(); } inline bool IsEnter(const Node* node) { return node->IsEnter(); } inline bool IsExit(const Node* node) { return node->IsExit(); } inline bool IsNextIteration(const Node* n) { return n->IsNextIteration(); } +inline bool IsCall(const Node* node) { return node->IsCall(); } +inline bool IsReturn(const Node* node) { return node->IsCall(); } +inline bool IsNextCall(const Node* n) { return n->IsNextCall(); } 
inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return node->IsSend(); } diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 8dcb6798c1..968766f80d 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -52,6 +52,11 @@ inline bool IsNextIteration(const NodeDef& node_def) { node_def.op() == "RefNextIteration"; } +inline bool IsNextCall(const NodeDef& node_def) { + return node_def.op() == "NextCall" || + node_def.op() == "RefNextCall"; +} + bool IsValidNodeName(StringPiece s, bool allow_internal_ops) { using ::tensorflow::strings::Scanner; return Scanner(s) @@ -201,6 +206,7 @@ class GraphConstructor { int gdef_index; Node* node; // nullptr until the NodeDef is converted to a Node. }; + // TODO(vrv): Profile this data structure to see if we should use an // alternative implementation of std::unordered_map. 
std::unordered_map gdef_nodes_; @@ -244,6 +250,9 @@ class GraphConstructor { std::vector back_edges_; }; + + + // This could be expensive but we don't expect to call it often, if at all (only // if there are multiple nodes in g_ with the same name) bool NodeNameInValues(const std::map& input_map, @@ -389,31 +398,59 @@ std::unordered_set GetNextIterationNodes( return next_iteration_nodes; } + +std::unordered_set GetNextCallNodes( + const GraphConstructor::NodeDefSlice& node_defs) { + std::unordered_set next_call_nodes; + + for (int n = 0; n < node_defs.size(); ++n) { + const NodeDef& node_def = *node_defs[n]; + if (IsNextCall(node_def)) { + next_call_nodes.insert(node_def.name()); + } + } + + return next_call_nodes; +} + + Status GraphConstructor::InitFromEdges() { const int num_nodes = node_defs_.size(); pending_count_.reserve(num_nodes); outputs_.resize(num_nodes); - std::unordered_set next_iteration_nodes_ = - GetNextIterationNodes(node_defs_); + std::unordered_set next_iteration_nodes_ = GetNextIterationNodes(node_defs_); + std::unordered_set next_call_nodes_ = GetNextCallNodes(node_defs_); + // Parse the inputs for each node. for (int n = 0; n < num_nodes; ++n) { const NodeDef& node_def = *node_defs_[n]; + if (IsMerge(node_def)) { - // Cycles in the graph are only allowed for while loops. A while loop is - // identified by an edge from a NextIteration node to a Merge node. For - // such Merge nodes, only wait for one non-control input before + // Cycles in the graph are only allowed for while loops and recursion. + // A while loop is identified by an edge from a NextIteration node to a Merge node. + + // A recursion is identified by an edge from a NextCall Node to a Merge node + + // For such Merge nodes, only wait for one non-control input before // considering the node ready to process in Convert(). 
int32 num_control_edges = 0; bool has_loop_back_edge = false; + for (int i = 0; i < node_def.input_size(); ++i) { + StringPiece input_name(node_def.input(i)); + if (input_name.starts_with("^")) { num_control_edges++; - } else { + } + else { TensorId id(ParseTensorName(input_name)); - if (next_iteration_nodes_.find(id.first.ToString()) != - next_iteration_nodes_.end()) { + if (next_iteration_nodes_.find(id.first.ToString()) !=next_iteration_nodes_.end()) { + has_loop_back_edge = true; + } + + if (next_call_nodes_.find(id.first.ToString()) != next_call_nodes_.end()) { has_loop_back_edge = true; } } @@ -423,13 +460,20 @@ Status GraphConstructor::InitFromEdges() { } else { pending_count_.push_back(node_def.input_size()); } - } else { + } + + + else { pending_count_.push_back(node_def.input_size()); } + + if (node_def.input_size() == 0) { ready_.push_back(n); continue; } + + for (int i = 0; i < node_def.input_size(); ++i) { StringPiece input_name = node_def.input(i); TensorId id(ParseTensorName(input_name)); @@ -940,6 +984,9 @@ Status GraphConstructor::MakeEdge(Node* src, int output_index, Node* dst, } // namespace + + + Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts, const GraphDef& gdef, Graph* g) { ShapeRefiner refiner(gdef.versions().producer(), g->op_registry()); @@ -947,6 +994,9 @@ Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts, &gdef.library(), g, &refiner, nullptr); } + + + Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts, gtl::ArraySlice nodes, Graph* g) { ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, g->op_registry()); diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index acb8498142..f9c4a7f6f7 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -50,6 +50,21 @@ bool IsExit(const NodeDef& node) { return op == "Exit" || op == "RefExit"; } +bool IsCall(const NodeDef& node) { + const auto& op = node.op(); + return op == "Call" || 
op == "RefCall"; +} + +bool IsReturn(const NodeDef& node) { + const auto& op = node.op(); + return op == "Return" || op == "RefReturn"; +} + +bool IsNextCall(const NodeDef& node) { + const auto& op = node.op(); + return op == "NextCall" || op == "RefNextCall"; +} + bool IsIdentity(const NodeDef& node) { const auto& op = node.op(); return op == "Identity" || op == "RefIdentity"; diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 0de954fcb4..4932811122 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -27,6 +27,9 @@ bool IsConstant(const NodeDef& node); bool IsDequeueOp(const NodeDef& node); bool IsEnter(const NodeDef& node); bool IsExit(const NodeDef& node); +bool IsCall(const NodeDef& node); +bool IsReturn(const NodeDef& node); +bool IsNextCall(const NodeDef& node); bool IsIdentity(const NodeDef& node); bool IsMerge(const NodeDef& node); bool IsNextIteration(const NodeDef& node); diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 665a72264f..62dd1eeeb7 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -324,7 +324,7 @@ namespace tensorflow { new_merge = optimized_graph->add_node(); // Initialize new node - name = strings::StrCat(name, merge->input_size()-i-1); + name = strings::StrCat(name, size-i-1); new_merge->set_name(name); new_merge->set_op("Merge"); new_merge->set_device(func_node.device()); @@ -539,7 +539,7 @@ namespace tensorflow { /******************************************************************************************************/ // Dumps optimized graph in a not so readable form const GraphDef* tmp = optimized_graph; - printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); + //printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); // 
Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("INLINE"); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 39a004c601..2e5ae8f025 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -100,7 +100,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } } else { std::set available_optimizers = {"pruning", "constfold", - "function_transformation", + "function_transformation", "layout", "memory", "autoparallel", "arithmetic"}; for (const auto& optimizer : cfg_.optimizers()) { @@ -126,7 +126,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, optimizer->Optimize(cluster, optimized_item, optimized_graph)); } } - TopologicalSort(optimized_graph); + //TopologicalSort(optimized_graph); // Make sure that the optimizers preserved the graph version and library. DCHECK_GE(optimized_graph->library().function_size(), diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 61089658d7..c85a41b2a8 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -309,31 +309,155 @@ data: The tensor to be made available to the next iteration. output: The same tensor as `data`. )doc"); +// -------------------------------------------------------------------------- + +REGISTER_OP("Call") + .Input("data: T") + .Output("output: T") + .Attr("T: list(type)") + .Attr("frame_name: string") + .Attr("parallel_calls: int = 10") + .SetShapeFn([](shape_inference::InferenceContext* c) { + + std::vector input; + TF_RETURN_IF_ERROR(c->input("data", &input)); + TF_RETURN_IF_ERROR(c->set_output("output", input)); + + return Status::OK(); + }) + .Doc(R"Doc( +Creates (or finds) a child frame, and makes `data` available to the child frame. 
+ +This op is used together with `Return` to create recursive calls in the graph. +The unique `frame_name` is used by the `Executor` to identify frames. +At most `parallel_calls` recursive calls +are run in parallel in the child frame. + +data: The tensor to be made available to the child frame. +frame_name: The name of the child frame. +parallel_calls: The number of recursive calls allowed to run in parallel. +output: The same tensor as `data`. + +Returns a list of tensors with the same shapes and contents as the input +tensors. + )Doc"); + +REGISTER_OP("RefCall") + .Input("data: Ref(T)") + .Output("output: Ref(T)") + .Attr("T: type") + .Attr("frame_name: string") + .Attr("parallel_calls: int = 10") + .SetShapeFn(shape_inference::UnchangedShape) + .Doc(R"Doc( +Creates (or finds) a child frame, and makes `data` available to the child frame. + +This op is used together with `Return` to create recursive calls in the graph. +The unique `frame_name` is used by the `Executor` to identify frames. +At most `parallel_calls` recursive calls +are run in parallel in the child frame. + +data: The tensor to be made available to the child frame. +frame_name: The name of the child frame. +parallel_calls: The number of recursive calls allowed to run in parallel. +output: The same tensor as `data`. + +Returns a list of tensors with the same shapes and contents as the input +tensors. + )Doc"); + +// -------------------------------------------------------------------------- + +REGISTER_OP("Return") +.Input("data: T") +.Output("output: T") +.Attr("T: list(type)") +.SetShapeFn([](shape_inference::InferenceContext* c) { + +std::vector input; +TF_RETURN_IF_ERROR(c->input("data", &input)); +TF_RETURN_IF_ERROR(c->set_output("output", input)); + +return Status::OK(); +}) +.Doc(R"Doc( +Exits the current frame to its parent frame. +Exit makes its input `data` available to the parent frame. +data: The list of tensors to be made available to the parent frame. 
+output: The same list of tensors as `data`. + )Doc"); + +REGISTER_OP("RefReturn") +.Input("data: Ref(T)") +.Output("output: Ref(T)") +.Attr("T: type") +.SetShapeFn(shape_inference::UnchangedShape) +.Doc(R"Doc( +Exits the current frame to its parent frame. +Exit makes its input `data` available to the parent frame. +data: The list of tensors to be made available to the parent frame. +output: The same list of tensors as `data`. + )Doc"); + +// -------------------------------------------------------------------------- + +REGISTER_OP("NextCall") +.Input("data: T") +.Output("output: T") +.Attr("T: list(type)") +.SetShapeFn([](shape_inference::InferenceContext* c) { + +std::vector input; +TF_RETURN_IF_ERROR(c->input("data", &input)); +TF_RETURN_IF_ERROR(c->set_output("output", input)); + +return Status::OK(); +}) +.Doc(R"Doc( +Makes its input available to the next iteration. + +data: The list of tensors to be made available to the next iteration. +output: The same list of tensors as `data`. + )Doc"); + +REGISTER_OP("RefNextCall") +.Input("data: Ref(T)") +.Output("output: Ref(T)") +.Attr("T: type") +.SetShapeFn(shape_inference::UnchangedShape) +.Doc(R"Doc( +Makes its input available to the next iteration. + +data: The list of tensors to be made available to the next iteration. +output: The same list of tensors as `data`. + )Doc"); + + // -------------------------------------------------------------------------- REGISTER_OP("LoopCond") .Input("input: bool") .Output("output: bool") .SetShapeFn([](InferenceContext* c) { - return shape_inference::UnchangedShapeWithRank(c, 0); + return shape_inference::UnchangedShapeWithRank(c, 0); }) .Doc(R"doc( -Forwards the input to the output. + Forwards the input to the output. -This operator represents the loop termination condition used by the -"pivot" switches of a loop. + This operator represents the loop termination condition used by the + "pivot" switches of a loop. 
-input: A boolean scalar, representing the branch predicate of the Switch op. -output: The same tensor as `input`. -)doc"); + input: A boolean scalar, representing the branch predicate of the Switch op. + output: The same tensor as `input`. + )doc"); // -------------------------------------------------------------------------- REGISTER_OP("ControlTrigger") .SetShapeFn(shape_inference::NoOutputs) .Doc(R"docstring( -Does nothing. Serves as a control trigger for scheduling. + Does nothing. Serves as a control trigger for scheduling. -Only useful as a placeholder for control edges. -)docstring"); + Only useful as a placeholder for control edges. + )docstring"); // -------------------------------------------------------------------------- REGISTER_OP("Abort") @@ -341,14 +465,22 @@ REGISTER_OP("Abort") .Attr("exit_without_error: bool = false") .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( -Raise a exception to abort the process when called. + Raise a exception to abort the process when called. + + If exit_without_error is true, the process will exit normally, + otherwise it will exit with a SIGABORT signal. + + Returns nothing but an exception. + + error_msg: A string which is the message associated with the exception. + )doc"); + + + + + -If exit_without_error is true, the process will exit normally, -otherwise it will exit with a SIGABORT signal. -Returns nothing but an exception. -error_msg: A string which is the message associated with the exception. -)doc"); } // namespace tensorflow From cf1b2e89f6a06ddbcb8bfe00e0e238211cc4f19b Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Tue, 19 Jun 2018 20:18:18 +0300 Subject: [PATCH 06/64] Transformation v.3 - I broke Call/Return (IdentityN-like) nodes to multiple Call/Return (Identity-like). - I added a 'NextCall' op in transformation. 
--- .../common_runtime/graph_execution_state.cc | 3 +- tensorflow/core/graph/graph_constructor.cc | 11 +- .../optimizers/function_transformation.cc | 284 ++++++++++-------- 3 files changed, 175 insertions(+), 123 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index dcc47c07fe..d6660161da 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -362,8 +362,7 @@ Status GraphExecutionState::OptimizeGraph( GraphConstructorOptions opts; opts.allow_internal_ops = true; optimized_graph->reset(new Graph(OpRegistry::Global())); - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); /*******************************************************************************************/ // Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("Fully_Optimized"); diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 968766f80d..9d126d03d0 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -139,14 +139,23 @@ class GraphConstructor { g_(g), original_versions_(g->versions()), refiner_(refiner), - return_tensors_(return_tensors) {} + return_tensors_(return_tensors) { + + printf(" Graph Constructor\n"); + } Status TryImport() { + printf(" ensure No name collisions\n"); TF_RETURN_IF_ERROR(EnsureNoNameCollisions()); + printf(" validate input map and ctrl dep\n"); TF_RETURN_IF_ERROR(ValidateInputMapAndControlDependencies()); + printf(" build node index\n"); TF_RETURN_IF_ERROR(BuildNodeIndex()); + printf(" init from edges\n"); TF_RETURN_IF_ERROR(InitFromEdges()); + printf(" convert\n"); TF_RETURN_IF_ERROR(Convert()); + printf(" back edges\n"); 
TF_RETURN_IF_ERROR(AddBackEdges()); TF_RETURN_IF_ERROR(UpdateVersionDef()); TF_RETURN_IF_ERROR(PopulateReturnTensors()); diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 62dd1eeeb7..0404cea80f 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/grappler/optimizers/function_transformation.h" +#include +#include #include #include "tensorflow/core/util/event.pb.h" #include "tensorflow/core/util/events_writer.h" @@ -87,15 +89,15 @@ namespace tensorflow { TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext); }; - // Copy input/output argument type to the type_list. Return error if argument + // Copy input/output argument type to the type. Return error if argument // type is not explicitly defined, and not specified in function attributes. Status CopyArgType(const NodeDef& func_node, const std::unordered_map& func_attr, const string& arg_kind, const OpDef::ArgDef& arg, - AttrValue::ListValue* type_list) { + DataType* type) { if (arg.type() != DT_INVALID) { - type_list->add_type(arg.type()); + *type = arg.type(); } else { auto it = func_attr.find(arg.type_attr()); if (it == func_attr.end() || it->second.type() == DT_INVALID) { @@ -103,86 +105,97 @@ namespace tensorflow { "Invalid ", arg_kind, " argument ", arg.name(), " for function ", func_node.op(), " instantiated by ", func_node.name()); } - type_list->add_type(it->second.type()); + *type = it->second.type(); } return Status::OK(); } - // Add an IdentityN op to hook the function inputs to: this ensures that - // they're all evaluated before the evaluation of the function body starts. 
- Status HookInlinedFunctionInputs( - const NodeDef& func_node, const FunctionDef& func, - const std::unordered_map& func_attr, NodeDef* inputs) { - - inputs->set_name(strings::StrCat(func_node.name(), "/", "Enter")); - inputs->set_op("IdentityN"); - inputs->set_device(func_node.device()); - *inputs->mutable_input() = func_node.input(); //IdentityN node steals the inputs from Func node - AttrValue::ListValue* type_list = (*inputs->mutable_attr())["T"].mutable_list(); - for (const OpDef::ArgDef& arg : func.signature().input_arg()) { - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, type_list)); + string ParseString(string input) { + + size_t pos = 0; + std::string res = ""; + std::string delimiter = ":"; + + if ((pos = input.find(delimiter)) != std::string::npos) { + res = res + input.substr(0, pos); + input.erase(0, pos + delimiter.length()); + res = res + "/Ret" + input; } - return Status::OK(); + else { + res = input + "/Ret0"; + } + + std::cout << res << std::endl; + + return res; } - // Add an IdentityN op to hook the function outputs to: this ensures that the - // function body is fully evaluated before its fanout gets scheduled. - Status HookInlinedFunctionOutputs( - const NodeDef& func_node, const FunctionDef& func, - const std::unordered_map& func_attr, - const gtl::ArraySlice fetch, NodeDef* outputs) { - - // Exit Node - outputs->set_name(func_node.name());//strings::StrCat(func_node.name(), "/", "Exit")); - outputs->set_op("IdentityN"); - outputs->set_device(func_node.device()); - AttrValue::ListValue* type_list = (*outputs->mutable_attr())["T"].mutable_list(); - for (int i = 0; i < func.signature().output_arg_size(); ++i) { - const OpDef::ArgDef& arg = func.signature().output_arg(i); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, type_list)); - // Use the fetch names since they take into account the output mapping. 
- outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i])); + Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, + const FunctionInliningContext& function_inlining_ctx) { + + for (const NodeDef& node : item.graph.node()) { + + const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); + if (func != nullptr) { // If it's a function calling node + + for (int i = 0; i < func->signature().output_arg_size(); ++i) { + // const OpDef::ArgDef &arg = func->signature().output_arg(i); + foutputs.emplace(node.name()); // Fac + foutputs.emplace(strings::StrCat(node.name(), ":", i)); // Fac:i + //foutputs.emplace(strings::StrCat(node.name(), ":", arg.name(), ":", i)); // Fac:outarg:i + } + } } return Status::OK(); } Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, - std::unordered_map functions_in) { + std::unordered_map &functions_in) { printf("Recursion Detected\n"); const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); - NodeDef* merge; + DataType type; ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; - // Hook inlined function inputs to Enter node - NodeDef* func_inputs = optimized_graph->add_node(); - HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); - - // Hook Enter node's outputs to func's Merges nodes for (int i = 0; i < func.signature().input_arg_size(); ++i) { const OpDef::ArgDef &arg = func.signature().input_arg(i); - merge = argmerge_map[arg.name()]; - merge->add_input(strings::StrCat(func_inputs->name(), ":", i)); + // Create and add in graph a Call node for every input arg + NodeDef *call = optimized_graph->add_node(); + call->set_name(strings::StrCat(func_node.name(), "/", "NextCall_", arg.name())); + call->set_op("NextCall"); + call->set_device(func_node.device()); + call->add_input(func_node.input(i)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); + 
(*call->mutable_attr())["T"].set_type(type); + + + NodeDef* merge = argmerge_map[arg.name()]; + merge->add_input(call->name()); } - // Hook inlined function outputs to Exit node - string name = func_node.name(); - func_node.set_name(func_node.op()); - NodeDef* func_outputs = optimized_graph->add_node(); - HookInlinedFunctionOutputs(func_node, func, func_attr, functions_in[func_node.op()].fetch, func_outputs); - // Re-set node's name - I wanted to avoid changing HookInlinedFunctionOutputs - func_outputs->set_name(name); + for (int i = 0; i < func.signature().output_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().output_arg(i); + + NodeDef *ret = optimized_graph->add_node(); + ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); + ret->set_op("Return"); + ret->set_device(func_node.device()); + // Counting on the fact that op name will be the same as the name given initially to function + ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); + (*ret->mutable_attr())["T"].set_type(type); + } return Status::OK(); } Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, - GraphDef* optimized_graph, std::unordered_map functions_in) { + GraphDef* optimized_graph, std::unordered_map &functions_in) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); @@ -191,33 +204,45 @@ namespace tensorflow { return errors::InvalidArgument("Failed to inline function ", func_node.op(), " instantiated by ", func_node.name()); } + std::set foutputs; + GatherOutputs(foutputs, *item, ctx); + +std::cout << foutputs.size() << '\n'; +for( const auto& str : foutputs ) std::cout << str << '\n'; + + DataType type; + std::unordered_map input_nodes; functions_in[func_node.op()].fetch = item->fetch; ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; - 
NodeDef* merge; - std::unordered_map input_nodes; + for (int i = 0; i < func.signature().input_arg_size(); ++i) { const OpDef::ArgDef& arg = func.signature().input_arg(i); + input_nodes[arg.name()] = i; - // Create a merge node for every input arg - merge = optimized_graph->add_node(); - // Initialize merge + // Create and add in graph a Call node for every input arg + NodeDef* call = optimized_graph->add_node(); + call->set_name(strings::StrCat(func_node.name(), "/", "Call_", arg.name())); + call->set_op("Call"); + call->set_device(func_node.device()); + call->add_input(func_node.input(i)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); + (*call->mutable_attr())["T"].set_type(type); + + // Create and add a temporary merge node (IdentityN) for every input arg + NodeDef* merge = optimized_graph->add_node(); merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", arg.name())); merge->set_op("IdentityN"); merge->set_device(func_node.device()); + merge->add_input(call->name()); - // Merge's Attrs will be initialized later argmerge_map.emplace(arg.name(), merge); - } - // Hook inlined function inputs to Enter node - NodeDef* func_inputs = optimized_graph->add_node(); - HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs); for (NodeDef& func_body_node : *item->graph.mutable_node()) { - // If the func body node is func's input argument, these nodes will now take input from IdentityN + // If the func body node is func's input argument if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { CHECK_EQ(0, func_body_node.input_size()); // Turn input placeholders into identity nodes @@ -226,23 +251,27 @@ namespace tensorflow { } // Connect merge with input arg func_body_node.add_input(argmerge_map[func_body_node.name()]->name()); - // Connect IdentityN to merge - int input_id = input_nodes[func_body_node.name()]; - argmerge_map[func_body_node.name()]->add_input(strings::StrCat(func_inputs->name(), ":", input_id)); 
} // Else if not an input_arg_node else { // Update the input names if any. for (string& input : *func_body_node.mutable_input()) { + + // If it takes input from a function + if (foutputs.find(input) != foutputs.end()) { + input = ParseString(input); + } input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); } + /* // If the node has no input, make hook it up to the func_inputs node to // ensure it runs in the same frame as the other nodes of the function // body. if (func_body_node.input_size() == 0) { *func_body_node.add_input() = AsControlDependency(func_inputs->name()); } + */ } // Add the node name as a prefix to avoid collisions after inlining @@ -280,70 +309,70 @@ namespace tensorflow { } } - // Hook inlined function outputs to Exit node - NodeDef* func_outputs = optimized_graph->add_node(); - HookInlinedFunctionOutputs(func_node, func, func_attr, item->fetch, func_outputs); + for (int i = 0; i < func.signature().output_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().output_arg(i); + + NodeDef *ret = optimized_graph->add_node(); + ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); + ret->set_op("Return"); + ret->set_device(func_node.device()); + // If it takes input from a function + string input = item->fetch[i]; + if (foutputs.find(input) != foutputs.end()) { + input = ParseString(input); + } - // Break Merges into multiple common Merge ops and fix their attrs - int j=0; + ret->add_input(strings::StrCat(func_node.name(), "/", input)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); + (*ret->mutable_attr())["T"].set_type(type); + } + // Break IdentityN Merges into multiple common Merge ops + int j=0; for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { - NodeDef *new_merge; - merge = it->second; - DataType type; - const OpDef::ArgDef& arg = func.signature().input_arg(j); - - if (arg.type() != DT_INVALID) { - type = arg.type(); - } else { - auto it = 
func_attr.find(arg.type_attr()); - if (it == func_attr.end() || it->second.type() == DT_INVALID) { - return errors::InvalidArgument( - "Invalid argument ", arg.name(), " for function ", - func_node.op(), " instantiated by ", func_node.name()); - } - type = it->second.type(); - } - + NodeDef *new_merge, *merge = it->second; int i, size = merge->input_size(); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", func.signature().input_arg(j), &type)); - // If there is only one call site, leave Merge as it is (IdentityN node) - // and it will be eliminated by other optimizers - if (size < 2) - break; + // If there is only one call site + if (size < 2) { + merge->set_op("Identity"); + merge->set_device(func_node.device()); + (*merge->mutable_attr())["T"].set_type(type); + } - string name = merge->name(); - string in1 = merge->input(0), in2; + else { - for (i=1; i < size-1; i++) { + string name = merge->name(); + string in1 = merge->input(0), in2; - in2 = merge->input(i); + for (i = 1; i < size-1; i++) { - new_merge = optimized_graph->add_node(); + in2 = merge->input(i); + new_merge = optimized_graph->add_node(); - // Initialize new node - name = strings::StrCat(name, size-i-1); - new_merge->set_name(name); - new_merge->set_op("Merge"); - new_merge->set_device(func_node.device()); - new_merge->add_input(in1); - new_merge->add_input(in2); - (*new_merge->mutable_attr())["T"].set_type(type); + name = strings::StrCat(name, size - i - 1); + new_merge->set_name(name); + new_merge->set_op("Merge"); + new_merge->set_device(func_node.device()); + new_merge->add_input(in1); + new_merge->add_input(in2); + (*new_merge->mutable_attr())["T"].set_type(type); - in1 = name; - } - - // Modify initial Merge - in2 = merge->input(i); - merge->set_op("Merge"); - merge->set_device(func_node.device()); - merge->clear_input(); - merge->add_input(in1); - merge->add_input(in2); - (*merge->mutable_attr())["T"].set_type(type); + in1 = name; + } + // Modify initial Merge + in2 = 
merge->input(i); + merge->set_op("Merge"); + merge->set_device(func_node.device()); + merge->clear_input(); + merge->add_input(in1); + merge->add_input(in2); + (*merge->mutable_attr())["T"].set_type(type); + } } return Status::OK(); @@ -497,12 +526,18 @@ namespace tensorflow { Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { + GraphDef* optimized_graph) { printf("Function Transformation: Enabled By Default\n"); FunctionInliningContext function_inlining_ctx(item); + std::set foutputs; + GatherOutputs(foutputs, item, function_inlining_ctx); + +std::cout << foutputs.size() << '\n'; +for( const auto& str : foutputs ) std::cout << str << '\n'; + // Nothing to do here. if (!function_inlining_ctx.HasInlinedFunctions()) { *optimized_graph = item.graph; @@ -511,15 +546,23 @@ namespace tensorflow { // SymbolicGradientEnv env(item.graph.versions().producer(),item.graph.library()); -// std::unordered_map argmerge_map; std::unordered_map functions_in; - for (const NodeDef& node : item.graph.node()) { + // Copying node cause I need to make changes on it + for (NodeDef node : item.graph.node()) { // if (node.op() == "SymbolicGradient") { // TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph)); // continue; // } + for (string& input : *node.mutable_input()) { + + // If it takes input from a function + if (foutputs.find(input) != foutputs.end()) { + input = ParseString(input); + } + } + const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); if (func != nullptr) { FuncInfo func_info; @@ -535,11 +578,12 @@ namespace tensorflow { *optimized_graph->mutable_versions() = item.graph.versions(); *optimized_graph->mutable_library() = item.graph.library(); + // delete set /******************************************************************************************************/ // Dumps optimized graph in a not so readable form const GraphDef* tmp = optimized_graph; - 
//printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); + printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); // Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("INLINE"); From 5b8715e68dcebad940e088c64506861d1fdfaf44 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Wed, 20 Jun 2018 23:17:53 +0300 Subject: [PATCH 07/64] Transformation_v.4 - removed NextCall op - fixed convertToGraph / transformation now makes it to Executor --- TESTS/mutrec.py | 31 ++++++ tensorflow/core/graph/graph.cc | 1 - tensorflow/core/graph/graph.h | 7 +- tensorflow/core/graph/graph_constructor.cc | 103 ++++++++---------- tensorflow/core/grappler/op_types.cc | 5 - tensorflow/core/grappler/op_types.h | 1 - .../optimizers/function_transformation.cc | 23 ++-- .../grappler/optimizers/meta_optimizer.cc | 8 +- tensorflow/core/ops/control_flow_ops.cc | 75 ++++--------- 9 files changed, 114 insertions(+), 140 deletions(-) create mode 100644 TESTS/mutrec.py diff --git a/TESTS/mutrec.py b/TESTS/mutrec.py new file mode 100644 index 0000000000..370f3793a5 --- /dev/null +++ b/TESTS/mutrec.py @@ -0,0 +1,31 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +f = function.Declare("F", [("n", tf.int32)], [("ret", tf.int32)]) +g = function.Declare("G", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="F", out_names=["ret"]) +def FImpl(n): + return tf.cond(tf.less_equal(n, 1), + lambda: tf.constant(1), + lambda: g(n - 1)) + +@function.Defun(tf.int32, func_name="G", out_names=["ret"]) +def GImpl(n): + return f(n) + +# Building the graph. 
+ +FImpl.add_to_graph(tf.get_default_graph()) +GImpl.add_to_graph(tf.get_default_graph()) + + +n = tf.placeholder(tf.int32, name="MyPlaceHolder") +x = f(n) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session() as sess: # no need to manually close the session + print(sess.run([x], feed_dict={n:4})) + +writer.close() diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 948d364ddd..ac9b78b03b 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -63,7 +63,6 @@ const std::unordered_map& Node::kNodeClassTable = REF_CLASS("NextIteration", NC_NEXT_ITERATION), REF_CLASS("Call", NC_CALL), REF_CLASS("Return", NC_RETURN), - REF_CLASS("NextCall", NC_NEXT_CALL), {"LoopCond", NC_LOOP_COND}, {"ControlTrigger", NC_CONTROL_TRIGGER}, {"_Send", NC_SEND}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index e7b81b0531..e3a72f6acf 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -142,9 +142,8 @@ class Node { bool IsEnter() const { return class_ == NC_ENTER; } bool IsExit() const { return class_ == NC_EXIT; } bool IsNextIteration() const { return class_ == NC_NEXT_ITERATION; } - bool IsCall() const { return class_ == NC_ENTER; } - bool IsReturn() const { return class_ == NC_EXIT; } - bool IsNextCall() const { return class_ == NC_NEXT_ITERATION; } + bool IsCall() const { return class_ == NC_CALL; } + bool IsReturn() const { return class_ == NC_RETURN; } bool IsLoopCond() const { return class_ == NC_LOOP_COND; } bool IsControlTrigger() const { return class_ == NC_CONTROL_TRIGGER; } bool IsSend() const { return class_ == NC_SEND || class_ == NC_HOST_SEND; } @@ -224,7 +223,6 @@ class Node { NC_NEXT_ITERATION, NC_CALL, NC_RETURN, - NC_NEXT_CALL, NC_LOOP_COND, NC_CONTROL_TRIGGER, NC_SEND, @@ -654,7 +652,6 @@ inline bool IsExit(const Node* node) { return node->IsExit(); } inline bool IsNextIteration(const Node* n) { return 
n->IsNextIteration(); } inline bool IsCall(const Node* node) { return node->IsCall(); } inline bool IsReturn(const Node* node) { return node->IsCall(); } -inline bool IsNextCall(const Node* n) { return n->IsNextCall(); } inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return node->IsSend(); } diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 9d126d03d0..b5efa49926 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -52,11 +52,17 @@ inline bool IsNextIteration(const NodeDef& node_def) { node_def.op() == "RefNextIteration"; } -inline bool IsNextCall(const NodeDef& node_def) { - return node_def.op() == "NextCall" || - node_def.op() == "RefNextCall"; +inline bool IsCall(const NodeDef& node_def) { + return node_def.op() == "Call" || + node_def.op() == "RefCall"; } +inline bool IsReturn(const NodeDef& node_def) { + return node_def.op() == "Return" || + node_def.op() == "RefReturn"; + } + + bool IsValidNodeName(StringPiece s, bool allow_internal_ops) { using ::tensorflow::strings::Scanner; return Scanner(s) @@ -139,23 +145,14 @@ class GraphConstructor { g_(g), original_versions_(g->versions()), refiner_(refiner), - return_tensors_(return_tensors) { - - printf(" Graph Constructor\n"); - } + return_tensors_(return_tensors) {} Status TryImport() { - printf(" ensure No name collisions\n"); TF_RETURN_IF_ERROR(EnsureNoNameCollisions()); - printf(" validate input map and ctrl dep\n"); TF_RETURN_IF_ERROR(ValidateInputMapAndControlDependencies()); - printf(" build node index\n"); TF_RETURN_IF_ERROR(BuildNodeIndex()); - printf(" init from edges\n"); TF_RETURN_IF_ERROR(InitFromEdges()); - printf(" convert\n"); TF_RETURN_IF_ERROR(Convert()); - printf(" back edges\n"); TF_RETURN_IF_ERROR(AddBackEdges()); 
TF_RETURN_IF_ERROR(UpdateVersionDef()); TF_RETURN_IF_ERROR(PopulateReturnTensors()); @@ -184,15 +181,12 @@ class GraphConstructor { // input_already_exists is a pre-initialized vector of length // node_def->input_size(). This function will mark inputs that are remapped to // true. - void RemapNodeDefInputs(NodeDef* node_def, - std::vector* input_already_exists); + void RemapNodeDefInputs(NodeDef* node_def, std::vector* input_already_exists); // input_already_exists is a pre-initialized vector of length // node_def->input_size(). This function will add and mark control inputs as // true. - void AddControlDependencies(NodeDef* node_def, - std::vector* input_already_exists); - void AddPrefixToNodeDef(const std::vector& input_already_exists, - NodeDef* node_def); + void AddControlDependencies(NodeDef* node_def, std::vector* input_already_exists); + void AddPrefixToNodeDef(const std::vector& input_already_exists, NodeDef* node_def); // From constructor const Options opts_; @@ -260,8 +254,6 @@ class GraphConstructor { }; - - // This could be expensive but we don't expect to call it often, if at all (only // if there are multiple nodes in g_ with the same name) bool NodeNameInValues(const std::map& input_map, @@ -393,43 +385,28 @@ Status GraphConstructor::BuildNodeIndex() { return Status::OK(); } -std::unordered_set GetNextIterationNodes( +std::unordered_set GetNextIterationCallNodes( const GraphConstructor::NodeDefSlice& node_defs) { - std::unordered_set next_iteration_nodes; - for (int n = 0; n < node_defs.size(); ++n) { - const NodeDef& node_def = *node_defs[n]; - if (IsNextIteration(node_def)) { - next_iteration_nodes.insert(node_def.name()); - } - } - - return next_iteration_nodes; -} - - -std::unordered_set GetNextCallNodes( - const GraphConstructor::NodeDefSlice& node_defs) { - std::unordered_set next_call_nodes; + std::unordered_set next_iteration_call_nodes; for (int n = 0; n < node_defs.size(); ++n) { const NodeDef& node_def = *node_defs[n]; - if 
(IsNextCall(node_def)) { - next_call_nodes.insert(node_def.name()); + if (IsNextIteration(node_def) || IsCall(node_def)) { + next_iteration_call_nodes.insert(node_def.name()); } } - return next_call_nodes; + return next_iteration_call_nodes; } + Status GraphConstructor::InitFromEdges() { const int num_nodes = node_defs_.size(); pending_count_.reserve(num_nodes); outputs_.resize(num_nodes); - std::unordered_set next_iteration_nodes_ = GetNextIterationNodes(node_defs_); - std::unordered_set next_call_nodes_ = GetNextCallNodes(node_defs_); - + std::unordered_set next_iteration_call_nodes_ = GetNextIterationCallNodes(node_defs_); // Parse the inputs for each node. for (int n = 0; n < num_nodes; ++n) { @@ -438,9 +415,7 @@ Status GraphConstructor::InitFromEdges() { if (IsMerge(node_def)) { // Cycles in the graph are only allowed for while loops and recursion. // A while loop is identified by an edge from a NextIteration node to a Merge node. - // A recursion is identified by an edge from a NextCall Node to a Merge node - // For such Merge nodes, only wait for one non-control input before // considering the node ready to process in Convert(). 
int32 num_control_edges = 0; @@ -455,11 +430,7 @@ Status GraphConstructor::InitFromEdges() { } else { TensorId id(ParseTensorName(input_name)); - if (next_iteration_nodes_.find(id.first.ToString()) !=next_iteration_nodes_.end()) { - has_loop_back_edge = true; - } - - if (next_call_nodes_.find(id.first.ToString()) != next_call_nodes_.end()) { + if (next_iteration_call_nodes_.find(id.first.ToString()) != next_iteration_call_nodes_.end()) { has_loop_back_edge = true; } } @@ -471,6 +442,22 @@ Status GraphConstructor::InitFromEdges() { } } + // Does not necessarily mean cycle though - maybe I should find a better condition + else if (IsReturn(node_def)) { + int32 num_control_edges = 0; + + for (int i = 0; i < node_def.input_size(); ++i) { + + StringPiece input_name(node_def.input(i)); + + if (input_name.starts_with("^")) { + num_control_edges++; + } + } + + pending_count_.push_back(num_control_edges); + ready_.push_back(n); + } else { pending_count_.push_back(node_def.input_size()); @@ -482,7 +469,6 @@ Status GraphConstructor::InitFromEdges() { continue; } - for (int i = 0; i < node_def.input_size(); ++i) { StringPiece input_name = node_def.input(i); TensorId id(ParseTensorName(input_name)); @@ -519,7 +505,7 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) { // Add the node to the graph. Status status; *node = g_->AddNode(node_def, &status); - if (!status.ok()) return status; + if (!status.ok()) {return status;} if (opts_.expect_device_spec) { (*node)->set_assigned_device_name(node_def.device()); } @@ -771,6 +757,7 @@ Status GraphConstructor::Convert() { // inputs, pending_counts_ with the number of inputs for each node and // outputs_ with the outputs of each node). 
while (!ready_.empty()) { + int o = ready_.back(); ready_.pop_back(); ++processed; @@ -800,10 +787,6 @@ Status GraphConstructor::Convert() { } } - // TODO(ashankar): The line below means an additional copy of the NodeDef, - // which can be expensive if the NodeDef contains large tensors in it. - // Might make sense to change the API for ImportGraphDef to take a mutable - // GraphDef* and avoid the copying. imported_node_def = original_node_def; if (!opts_.input_map.empty()) { // Note that input_already_exists can shrink here @@ -814,7 +797,8 @@ Status GraphConstructor::Convert() { AddControlDependencies(&imported_node_def, &input_already_exists); } node_def = &imported_node_def; - } else { + } + else { node_def = &original_node_def; } @@ -850,7 +834,7 @@ Status GraphConstructor::Convert() { inputs.push_back(InputInfo(id.first.ToString(), src_node, src_index)); } - if (has_data_back_edge && !IsMerge(*node_def)) { + if (has_data_back_edge && !IsMerge(*node_def) && !IsReturn(*node_def)) { return errors::InvalidArgument( "Node '", node_def->name(), "' had a back edge, but only Merge nodes can have back edges."); @@ -879,8 +863,6 @@ Status GraphConstructor::Convert() { } } - // TODO(skyewm): remove conditional when b/35715995 ("Functions lack shape - // inference") is resolved. 
if (g_->flib_def().Find(node_def->name()) == nullptr) { TF_RETURN_IF_ERROR(ValidateShape(node)); } @@ -893,6 +875,7 @@ Status GraphConstructor::Convert() { return errors::InvalidArgument(node_defs_.size() - processed, " nodes in a cycle"); } + return Status::OK(); } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index f9c4a7f6f7..6b41f952b9 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -60,11 +60,6 @@ bool IsReturn(const NodeDef& node) { return op == "Return" || op == "RefReturn"; } -bool IsNextCall(const NodeDef& node) { - const auto& op = node.op(); - return op == "NextCall" || op == "RefNextCall"; -} - bool IsIdentity(const NodeDef& node) { const auto& op = node.op(); return op == "Identity" || op == "RefIdentity"; diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 4932811122..6feab5bb3d 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -29,7 +29,6 @@ bool IsEnter(const NodeDef& node); bool IsExit(const NodeDef& node); bool IsCall(const NodeDef& node); bool IsReturn(const NodeDef& node); -bool IsNextCall(const NodeDef& node); bool IsIdentity(const NodeDef& node); bool IsMerge(const NodeDef& node); bool IsNextIteration(const NodeDef& node); diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 0404cea80f..e65649b576 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -153,7 +153,7 @@ namespace tensorflow { Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, std::unordered_map &functions_in) { - printf("Recursion Detected\n"); +// printf("Recursion Detected\n"); const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); @@ -165,8 +165,8 @@ 
namespace tensorflow { // Create and add in graph a Call node for every input arg NodeDef *call = optimized_graph->add_node(); - call->set_name(strings::StrCat(func_node.name(), "/", "NextCall_", arg.name())); - call->set_op("NextCall"); + call->set_name(strings::StrCat(func_node.name(), "/", "Call_", arg.name())); + call->set_op("Call"); call->set_device(func_node.device()); call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); @@ -207,8 +207,8 @@ namespace tensorflow { std::set foutputs; GatherOutputs(foutputs, *item, ctx); -std::cout << foutputs.size() << '\n'; -for( const auto& str : foutputs ) std::cout << str << '\n'; +//std::cout << foutputs.size() << '\n'; +//for( const auto& str : foutputs ) std::cout << str << '\n'; DataType type; std::unordered_map input_nodes; @@ -360,6 +360,7 @@ for( const auto& str : foutputs ) std::cout << str << '\n'; new_merge->add_input(in1); new_merge->add_input(in2); (*new_merge->mutable_attr())["T"].set_type(type); + (*new_merge->mutable_attr())["N"].set_i(2); in1 = name; } @@ -372,6 +373,7 @@ for( const auto& str : foutputs ) std::cout << str << '\n'; merge->add_input(in1); merge->add_input(in2); (*merge->mutable_attr())["T"].set_type(type); + (*merge->mutable_attr())["N"].set_i(2); } } @@ -528,15 +530,13 @@ for( const auto& str : foutputs ) std::cout << str << '\n'; Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { - printf("Function Transformation: Enabled By Default\n"); - FunctionInliningContext function_inlining_ctx(item); std::set foutputs; GatherOutputs(foutputs, item, function_inlining_ctx); -std::cout << foutputs.size() << '\n'; -for( const auto& str : foutputs ) std::cout << str << '\n'; +//std::cout << foutputs.size() << '\n'; +//for( const auto& str : foutputs ) std::cout << str << '\n'; // Nothing to do here. 
if (!function_inlining_ctx.HasInlinedFunctions()) { @@ -578,9 +578,8 @@ for( const auto& str : foutputs ) std::cout << str << '\n'; *optimized_graph->mutable_versions() = item.graph.versions(); *optimized_graph->mutable_library() = item.graph.library(); - // delete set - /******************************************************************************************************/ + /****************************************************************************************************** // Dumps optimized graph in a not so readable form const GraphDef* tmp = optimized_graph; printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); @@ -601,7 +600,7 @@ for( const auto& str : foutputs ) std::cout << str << '\n'; const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); - /******************************************************************************************************/ + ******************************************************************************************************/ return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 2e5ae8f025..037678e3c0 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -66,14 +66,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, if (!cfg_.disable_model_pruning()) { optimizers.push_back(std::unique_ptr(new ModelPruner())); } - if (cfg_.function_transformation() != RewriterConfig::OFF) { - optimizers.push_back( - std::unique_ptr(new FunctionTransformation())); - } if (cfg_.constant_folding() != RewriterConfig::OFF) { optimizers.push_back( std::unique_ptr(new ConstantFolding(cpu_device_))); } + if (cfg_.function_transformation() != RewriterConfig::OFF) { + optimizers.push_back( + std::unique_ptr(new FunctionTransformation())); + } if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { 
optimizers.push_back( std::unique_ptr(new ArithmeticOptimizer())); diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index c85a41b2a8..a1f4f37ed9 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -314,15 +314,25 @@ output: The same tensor as `data`. REGISTER_OP("Call") .Input("data: T") .Output("output: T") - .Attr("T: list(type)") + .Attr("T: type") .Attr("frame_name: string") + .Attr("is_constant: bool = false") .Attr("parallel_calls: int = 10") - .SetShapeFn([](shape_inference::InferenceContext* c) { - - std::vector input; - TF_RETURN_IF_ERROR(c->input("data", &input)); - TF_RETURN_IF_ERROR(c->set_output("output", input)); + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->UnknownShape()); + // Handle resource shape / dtype, if present. + auto* handle_data = c->input_handle_shapes_and_types(0); + if (handle_data != nullptr) { + c->set_output_handle_shapes_and_types(0, *handle_data); + } else { + // Otherwise, propagate shape if output is a constant. + bool is_constant; + TF_RETURN_IF_ERROR(c->GetAttr("is_constant", &is_constant)); + if (is_constant) { + c->set_output(0, c->input(0)); + } + } return Status::OK(); }) .Doc(R"Doc( @@ -338,7 +348,7 @@ frame_name: The name of the child frame. parallel_calls: The number of recursive calls allowed to run in parallel. output: The same tensor as `data`. -Returns a list of tensors with the same shapes and contents as the input +Returns tensors with the same shapes and contents as the input tensors. )Doc"); @@ -347,6 +357,7 @@ REGISTER_OP("RefCall") .Output("output: Ref(T)") .Attr("T: type") .Attr("frame_name: string") + .Attr("is_constant: bool = false") .Attr("parallel_calls: int = 10") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( @@ -362,7 +373,7 @@ frame_name: The name of the child frame. parallel_calls: The number of recursive calls allowed to run in parallel. output: The same tensor as `data`. 
-Returns a list of tensors with the same shapes and contents as the input +Returns tensors with the same shapes and contents as the input tensors. )Doc"); @@ -371,15 +382,8 @@ tensors. REGISTER_OP("Return") .Input("data: T") .Output("output: T") -.Attr("T: list(type)") -.SetShapeFn([](shape_inference::InferenceContext* c) { - -std::vector input; -TF_RETURN_IF_ERROR(c->input("data", &input)); -TF_RETURN_IF_ERROR(c->set_output("output", input)); - -return Status::OK(); -}) +.Attr("T: type") +.SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( Exits the current frame to its parent frame. Exit makes its input `data` available to the parent frame. @@ -395,41 +399,8 @@ REGISTER_OP("RefReturn") .Doc(R"Doc( Exits the current frame to its parent frame. Exit makes its input `data` available to the parent frame. -data: The list of tensors to be made available to the parent frame. -output: The same list of tensors as `data`. - )Doc"); - -// -------------------------------------------------------------------------- - -REGISTER_OP("NextCall") -.Input("data: T") -.Output("output: T") -.Attr("T: list(type)") -.SetShapeFn([](shape_inference::InferenceContext* c) { - -std::vector input; -TF_RETURN_IF_ERROR(c->input("data", &input)); -TF_RETURN_IF_ERROR(c->set_output("output", input)); - -return Status::OK(); -}) -.Doc(R"Doc( -Makes its input available to the next iteration. - -data: The list of tensors to be made available to the next iteration. -output: The same list of tensors as `data`. - )Doc"); - -REGISTER_OP("RefNextCall") -.Input("data: Ref(T)") -.Output("output: Ref(T)") -.Attr("T: type") -.SetShapeFn(shape_inference::UnchangedShape) -.Doc(R"Doc( -Makes its input available to the next iteration. - -data: The list of tensors to be made available to the next iteration. -output: The same list of tensors as `data`. +data: The tensors to be made available to the parent frame. +output: The same tensors as `data`. 
)Doc"); From 7113382d0d83d166dc8cab5fd7e8a5a07921d2dd Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Tue, 26 Jun 2018 01:05:30 +0300 Subject: [PATCH 08/64] Call/Return kernels --- .../core/common_runtime/graph_optimizer.cc | 26 +-- .../optimizers/function_transformation.cc | 14 +- tensorflow/core/kernels/control_flow_ops.cc | 182 ++++++++++++++++++ tensorflow/core/kernels/control_flow_ops.h | 28 +++ tensorflow/core/ops/control_flow_ops.cc | 2 + 5 files changed, 238 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index ff99db9532..f4ce2ae6e7 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -58,18 +58,20 @@ void GraphOptimizer::Optimize( changed = true; } - if (opts_.do_constant_folding()) { - ConstantFoldingOptions cf_opts; - cf_opts.shape_map = shape_map; - bool was_mutated; - ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) - .IgnoreError(); - if (was_mutated) { - RemoveDeadNodes(g); - DumpGraph("ConstFolding", g); - changed = true; - } - } +// RemoveIdentityNodes(g); +// changed = true; +// if (opts_.do_constant_folding()) { +// ConstantFoldingOptions cf_opts; +// cf_opts.shape_map = shape_map; +// bool was_mutated; +// ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) +// .IgnoreError(); +// if (was_mutated) { +// RemoveDeadNodes(g); +// DumpGraph("ConstFolding", g); +// changed = true; +// } +// } if (opts_.do_function_inlining() && FixupSourceAndSinkEdges(g)) { DumpGraph("FixupSourceAndSinkEdges", g); diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index e65649b576..ea2c68aba4 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -171,6 +171,10 @@ namespace tensorflow { 
call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); + (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*call->mutable_attr())["is_constant"].set_b(false); +// (*call->mutable_attr())["parallel_calls"].set_i(10); + NodeDef* merge = argmerge_map[arg.name()]; @@ -188,6 +192,8 @@ namespace tensorflow { ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); (*ret->mutable_attr())["T"].set_type(type); + (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + } return Status::OK(); @@ -228,6 +234,9 @@ namespace tensorflow { call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); + (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*call->mutable_attr())["is_constant"].set_b(false); +// (*call->mutable_attr())["parallel_calls"].set_i(10); // Create and add a temporary merge node (IdentityN) for every input arg NodeDef* merge = optimized_graph->add_node(); @@ -325,6 +334,7 @@ namespace tensorflow { ret->add_input(strings::StrCat(func_node.name(), "/", input)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); (*ret->mutable_attr())["T"].set_type(type); + (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); } // Break IdentityN Merges into multiple common Merge ops @@ -579,7 +589,7 @@ namespace tensorflow { *optimized_graph->mutable_library() = item.graph.library(); - /****************************************************************************************************** + /******************************************************************************************************/ // Dumps optimized graph in a not so 
readable form const GraphDef* tmp = optimized_graph; printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); @@ -600,7 +610,7 @@ namespace tensorflow { const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); - ******************************************************************************************************/ + /******************************************************************************************************/ return Status::OK(); } diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 64c06786bc..d30eddb5c8 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -587,6 +587,188 @@ REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL #endif // TENSORFLOW_USE_SYCL + + + + +/*************************************************************************************************/ + + void CallOp::Compute(OpKernelContext* context) { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } + } + + REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOp); + REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Call").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) +#define REGISTER_GPU_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefCall").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) + + TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); + TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); + REGISTER_GPU_KERNEL(bool); + REGISTER_GPU_REF_KERNEL(bool); + +#undef REGISTER_GPU_KERNEL +#undef REGISTER_GPU_REF_KERNEL + +#ifdef TENSORFLOW_USE_SYCL + #define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Call").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) 
+REGISTER_SYCL_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); + +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefCall").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) +REGISTER_SYCL_REF_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); + +#undef REGISTER_SYCL_KERNEL +#undef REGISTER_SYCL_REF_KERNEL +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Call") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefCall") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_REF_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +REGISTER_SYCL_HOST_REF_KERNEL(string); +REGISTER_SYCL_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_SYCL_HOST_KERNEL +#undef REGISTER_SYCL_HOST_REF_KERNEL +#endif // TENSORFLOW_USE_SYCL + +#define REGISTER_GPU_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Call") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +#define REGISTER_GPU_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefCall") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + + REGISTER_GPU_HOST_KERNEL(int32); + REGISTER_GPU_HOST_REF_KERNEL(int32); + REGISTER_GPU_HOST_KERNEL(string); + REGISTER_GPU_HOST_REF_KERNEL(string); + REGISTER_GPU_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_GPU_HOST_KERNEL +#undef REGISTER_GPU_HOST_REF_KERNEL + + void ReturnOp::Compute(OpKernelContext* context) { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } + } + + 
REGISTER_KERNEL_BUILDER(Name("Return").Device(DEVICE_CPU), ReturnOp); + REGISTER_KERNEL_BUILDER(Name("RefReturn").Device(DEVICE_CPU), ReturnOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Return").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); +#define REGISTER_GPU_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefReturn").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); + + TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); + TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); + REGISTER_GPU_KERNEL(bool); + REGISTER_GPU_REF_KERNEL(bool); + +#undef REGISTER_GPU_KERNEL +#undef REGISTER_GPU_REF_KERNEL + +#ifdef TENSORFLOW_USE_SYCL + #define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Return").Device(DEVICE_SYCL).TypeConstraint("T"), ReturnOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefReturn").Device(DEVICE_SYCL).TypeConstraint("T"), ReturnOp); +REGISTER_SYCL_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); + +#undef REGISTER_SYCL_KERNEL +#undef REGISTER_SYCL_REF_KERNEL + +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Return") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp); \ + REGISTER_KERNEL_BUILDER(Name("RefReturn") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +#undef REGISTER_SYCL_HOST_KERNEL +#endif // TENSORFLOW_USE_SYCL + +#define REGISTER_GPU_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Return") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp); \ + REGISTER_KERNEL_BUILDER(Name("RefReturn") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp) + + REGISTER_GPU_HOST_KERNEL(int32); + 
REGISTER_GPU_HOST_KERNEL(string); + +#undef REGISTER_GPU_HOST_KERNEL + +/*************************************************************************************************/ + + + + + + + // A LoopCond op has one input and one output. The input is a boolean // scalar representing the taken branches of the "pivot" Switch that // determines loop termination. As a contract, any high-level front-end diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h index 4838f2e2bf..42ec64fb0f 100644 --- a/tensorflow/core/kernels/control_flow_ops.h +++ b/tensorflow/core/kernels/control_flow_ops.h @@ -97,6 +97,34 @@ class NextIterationOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp); }; +/**************************************************************************/ +// A call op has one input and one output. It creates or finds +// the child frame that is uniquely identified by the frame_name, +// and makes its input available to the child frame. +class CallOp : public OpKernel { +public: + explicit CallOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~CallOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(CallOp); +}; + +// A Return op has one input and one output. It exits the current +// frame to its parent frame, and makes its input available to the +// parent frame only if it receives a tensor with a specific tag. 
+class ReturnOp : public OpKernel { +public: + explicit ReturnOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~ReturnOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(ReturnOp); +}; + + } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_ diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index a1f4f37ed9..1b38f4c3e5 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -383,6 +383,7 @@ REGISTER_OP("Return") .Input("data: T") .Output("output: T") .Attr("T: type") +.Attr("frame_name: string") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( Exits the current frame to its parent frame. @@ -395,6 +396,7 @@ REGISTER_OP("RefReturn") .Input("data: Ref(T)") .Output("output: Ref(T)") .Attr("T: type") +.Attr("frame_name: string") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( Exits the current frame to its parent frame. From fc1114af35f6c4087d83c9c79bd5451ef9eb0bca Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Sat, 30 Jun 2018 03:55:19 +0300 Subject: [PATCH 09/64] Common Runtime -ongoing -Temporarily disabled constant folding optimizations and Topological Sort, for they are causing mini problems and need special handling. 
- --- TESTS/factorial.py | 8 +- TESTS/fib.py | 4 +- TESTS/func.py | 2 + tensorflow/core/common_runtime/executor.cc | 214 +++++++++++++++--- .../core/common_runtime/graph_optimizer.cc | 2 - tensorflow/core/graph/graph.h | 2 +- .../optimizers/function_transformation.cc | 24 +- .../grappler/optimizers/meta_optimizer.cc | 8 +- tensorflow/core/ops/control_flow_ops.cc | 8 - 9 files changed, 201 insertions(+), 71 deletions(-) diff --git a/TESTS/factorial.py b/TESTS/factorial.py index e2a3dda6ae..25542860f9 100644 --- a/TESTS/factorial.py +++ b/TESTS/factorial.py @@ -17,11 +17,13 @@ def FacImpl(n): result = fac(x) y = tf.add(result, 1) -#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) sess = tf.Session() -print(sess.run(y, feed_dict={n: 3})) +print(sess.run(y, feed_dict={n: 5})) -#writer.close() +writer.close() sess.close() diff --git a/TESTS/fib.py b/TESTS/fib.py index 8e5c330070..c6b4e4e9c1 100644 --- a/TESTS/fib.py +++ b/TESTS/fib.py @@ -16,14 +16,14 @@ def FibImpl(n): res = tf.add(x, 1) -#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) sess = tf.Session() #print(tf.get_default_graph().as_graph_def()) -#writer.close() +writer.close() print(sess.run(res, feed_dict={n: 5})) sess.close() diff --git a/TESTS/func.py b/TESTS/func.py index 6ceec896ae..7466491f33 100644 --- a/TESTS/func.py +++ b/TESTS/func.py @@ -18,6 +18,8 @@ def MyFunc(x, y): x = tf.add(c, d) +#print(tf.get_default_graph().as_graph_def()) + writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) with tf.Session() as sess: # no need to manually close the session diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index b1537eab01..a39d7c5834 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ 
b/tensorflow/core/common_runtime/executor.cc @@ -216,10 +216,14 @@ struct NodeItem { bool is_merge : 1; // True iff IsMerge(node) bool is_enter : 1; // True iff IsEnter(node) bool is_exit : 1; // True iff IsExit(node) + bool is_call : 1; // True iff IsCall(node) + bool is_return : 1; // True iff IsReturn(node) bool is_control_trigger : 1; // True iff IsControlTrigger(node) bool is_sink : 1; // True iff IsSink(node) // True iff IsEnter(node) || IsExit(node) || IsNextIteration(node) bool is_enter_exit_or_next_iter : 1; + // True iff IsCall(node) || IsReturn(node) + bool is_call_or_return : 1; // Cached values of node->num_inputs() and node->num_outputs(), to // avoid levels of indirection. @@ -396,8 +400,8 @@ class ExecutorImpl : public Executor { } }; - static Status BuildControlFlowInfo(const Graph* graph, - ControlFlowInfo* cf_info); + static Status BuildControlFlowInfo(const Graph* graph, ControlFlowInfo* cf_info, + std::unordered_map>& synonym_frames); void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info); FrameInfo* EnsureFrameInfo(const string& fname) { @@ -605,9 +609,11 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending, Status ExecutorImpl::Initialize() { gview_.Initialize(graph_); + std::unordered_map> synonym_frames; + // Build the information about frames in this subgraph. ControlFlowInfo cf_info; - TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_, &cf_info)); + TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_, &cf_info, synonym_frames)); // Cache this value so we make this virtual function call once, rather // that O(# steps * # nodes per step) times. 
@@ -649,10 +655,12 @@ Status ExecutorImpl::Initialize() { item->is_merge = IsMerge(n); item->is_enter = IsEnter(n); item->is_exit = IsExit(n); + item->is_call = IsCall(n); + item->is_return = IsReturn(n); item->is_control_trigger = IsControlTrigger(n); item->is_sink = IsSink(n); - item->is_enter_exit_or_next_iter = - (IsEnter(n) || IsExit(n) || IsNextIteration(n)); + item->is_enter_exit_or_next_iter = (IsEnter(n) || IsExit(n) || IsNextIteration(n)); + item->is_call_or_return = (IsCall(n) || IsReturn(n)); // Compute the maximum values we'll store for this node in the // pending counts data structure, and allocate a handle in @@ -660,12 +668,11 @@ Status ExecutorImpl::Initialize() { // space to store these maximal count values. size_t max_pending, max_dead; GetMaxPendingCounts(n, &max_pending, &max_dead); - item->pending_id = - frame_info->pending_counts_layout.CreateHandle(max_pending, max_dead); + item->pending_id = frame_info->pending_counts_layout.CreateHandle(max_pending, max_dead); // Initialize static information about the frames in the graph. frame_info->nodes->push_back(n); - if (IsEnter(n)) { + if (IsEnter(n) || IsCall(n)) { string enter_name; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &enter_name)); EnsureFrameInfo(enter_name)->input_count++; @@ -676,6 +683,23 @@ Status ExecutorImpl::Initialize() { // all nodes. InitializePending(graph_, cf_info); + // Copy Synonym FrameInfos ------ is that necessary? 
+ for (const auto& frame : synonym_frames) { + FrameInfo* copyFrom = EnsureFrameInfo(frame.first); + for (const auto& syn : frame.second) { + FrameInfo* frame_info = EnsureFrameInfo(syn); + // Copy FrameInfo + frame_info->total_inputs = copyFrom->total_inputs; + frame_info->input_count = copyFrom->input_count; + frame_info->pending_counts_layout = copyFrom->pending_counts_layout; + frame_info->pending_counts = new PendingCounts(*copyFrom->pending_counts); + frame_info->nodes = new std::vector; + for (const Node* n : *copyFrom->nodes) { + frame_info->nodes->push_back(n); + } + } + + } return gview_.SetAllocAttrs(graph_, params_.device); } @@ -1329,8 +1353,8 @@ ExecutorState::~ExecutorState() { delete slice_reader_cache_; } -Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, - ControlFlowInfo* cf_info) { +Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_info, + std::unordered_map>& synonym_frames) { const int num_nodes = g->num_node_ids(); cf_info->frame_names.resize(num_nodes); std::vector parent_nodes; @@ -1358,15 +1382,63 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, Node* parent = nullptr; if (IsEnter(curr_node)) { // Enter a child frame. - TF_RETURN_IF_ERROR( - GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); + TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); parent = curr_node; - } else if (IsExit(curr_node)) { + } + + else if (IsCall(curr_node)) { + TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); + + int out_id; + for (const Edge* out_edge : curr_node->out_edges()) { // Remove for loop and grab the only actual output of the call node + out_id = out_edge->dst()->id(); + break; + } + + // Not a recursive call + if (!visited[out_id]) { + // Enter a child frame. 
+ parent = curr_node; + // If not already in map, add it as a new key + if (synonym_frames.find(frame_name) == synonym_frames.end()) { + std::set synonyms; + synonyms.clear(); + synonym_frames.emplace(frame_name, synonyms); // std::move() + } + } + // Recursive call : either from within the same function or from another one + else { + // It's just a synonym frame + parent = parent_nodes[curr_id]; + synonym_frames[cf_info->frame_names[out_id]].emplace(frame_name); + frame_name = cf_info->frame_names[curr_id]; + } + } + + else if (IsExit(curr_node)) { // Exit to the parent frame. parent = parent_nodes[curr_id]; frame_name = cf_info->frame_names[parent->id()]; parent = parent_nodes[parent->id()]; - } else { + } + + else if (IsReturn(curr_node)) { + + TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); + // If frame_name exists as a key in the map and not as a synonym + if (synonym_frames.find(frame_name) != synonym_frames.end()) { + // Exit to the parent frame. + parent = parent_nodes[curr_id]; + frame_name = cf_info->frame_names[parent->id()]; + parent = parent_nodes[parent->id()]; + } + else { + parent = parent_nodes[curr_id]; + frame_name = cf_info->frame_names[curr_id]; + } + } + + else { parent = parent_nodes[curr_id]; frame_name = cf_info->frame_names[curr_id]; } @@ -1530,7 +1602,6 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { const int id = node->id(); const NodeItem& item = *gview.node(id); - // TODO(misard) Replace with a finer-grain enabling flag once we // add better optional debugging support. 
if (vlog_ && VLOG_IS_ON(1)) { mutex_lock l(input_frame->mu); @@ -1904,15 +1975,17 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, FrameState* output_frame = input_frame; int64 output_iter = input_iter; - if (!item->is_enter_exit_or_next_iter) { + if (!item->is_enter_exit_or_next_iter && !item->is_call_or_return) { // Fast path for nodes types that don't need special handling DCHECK_EQ(input_frame, output_frame); // Normal path for most nodes mutex_lock l(input_frame->mu); output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); - is_frame_done = input_frame->DecrementOutstandingOpsLocked( - &impl_->gview_, input_iter, ready); - } else if (item->is_enter) { + is_frame_done = input_frame->DecrementOutstandingOpsLocked(&impl_->gview_, input_iter, ready); + } + + else if (item->is_enter) { + bool is_constant; const Status s = GetNodeAttr(node->attrs(), "is_constant", &is_constant); DCHECK(s.ok()) << s; @@ -1929,9 +2002,42 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, } output_frame->num_pending_inputs--; } - is_frame_done = - input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); - } else if (item->is_exit) { + is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); + } + + else if (item->is_call) { + + FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame); + output_iter = 0; + { + const NodeItem* item = impl_->gview_.node(node->id()); + mutex_lock l(output_frame->mu); + output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); + output_frame->num_pending_inputs--; + } + + is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); + } + + else if (item->is_return) { + if (is_dead) { + // Stop the deadness propagation. 
+ output_frame = nullptr; + } + else { + + output_frame = input_frame->parent_frame; + output_iter = input_frame->parent_iter; + { + mutex_lock l(output_frame->mu); + output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); + } + } + is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); + } + + + else if (item->is_exit) { if (is_dead) { mutex_lock l(input_frame->mu); // Stop and remember this node if it is a dead exit. @@ -1940,17 +2046,21 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, } is_frame_done = input_frame->DecrementOutstandingOpsLocked( &impl_->gview_, input_iter, ready); - } else { + } + + + else { output_frame = input_frame->parent_frame; output_iter = input_frame->parent_iter; { mutex_lock l(output_frame->mu); output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); } - is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, - input_iter, ready); + is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); } - } else { + } + + else { DCHECK(IsNextIteration(node)); mutex_lock l(input_frame->mu); if (is_dead) { @@ -1972,7 +2082,7 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, } } if (output_frame != nullptr) { - // This is the case when node is not Enter, Exit, or NextIteration. + // This is the case when node is not Enter, Exit, NextIteration, Call or Return. DCHECK(input_frame == output_frame); output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); } @@ -2257,9 +2367,13 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, // Note that this new frame instance is created without any locks. 
if (vlog_) VLOG(2) << "Create frame: " << child_name; - int parallel_iters; - s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); - DCHECK(s.ok()) << s; + int parallel_iters = 1; + if (node->op_def().name() != "Call") { + printf("Yep its a call\n"); + s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); + DCHECK(s.ok()) << s; + } + FrameState* temp = new FrameState(impl_, parallel_iters); temp->frame_name = child_name; temp->frame_id = Hash64(child_name); @@ -2270,8 +2384,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, // 'iterations' is a fixed-length circular buffer. temp->iterations.resize(temp->max_parallel_iterations + 1); // Initialize iteration 0. - temp->iterations[0] = - new IterationState(temp->pending_counts, temp->total_input_tensors); + temp->iterations[0] = new IterationState(temp->pending_counts, temp->total_input_tensors); { mutex_lock executor_lock(mu_); @@ -2304,7 +2417,6 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { const auto dst_pending_id = impl_->gview_.node(dst_node->id())->pending_id; - // TODO(yuanbyu): We don't need this if we require the subgraph // given to an executor not to contain a sink node. if (dst_node->IsSink()) continue; @@ -2379,6 +2491,7 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, const size_t num_output_edges = item->num_output_edges; const EdgeInfo* edges = item->output_edge_list(); Entry* input_tensors = iter_state->input_tensors; + for (size_t out_index = 0; out_index < num_output_edges; out_index++) { const EdgeInfo& e = edges[out_index]; const int dst_id = e.dst_id; @@ -2386,7 +2499,6 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, const PendingCounts::Handle dst_pending_id = dst_item->pending_id; const int src_slot = e.output_slot; - // TODO(yuanbyu): We don't need this if we require the subgraph // given to an executor not to contain a sink node. 
if (dst_item->is_sink) continue; @@ -2397,6 +2509,7 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, // analysis happy. const bool is_control_edge = (src_slot == Graph::kControlSlot); bool dst_need_input = !is_control_edge; + if (dst_item->is_merge) { // A merge node is ready if all control inputs have arrived and either // a) a live data input becomes available or b) all data inputs are @@ -2435,11 +2548,38 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, } } } else { + + // In case of "Return" dst_node, + // we compare node's frame attr with current frame name + // if they are different, propagate as dead + bool wrong_ret = 0; + if (dst_item->is_return) { + + string frameName; + GetNodeAttr(dst_item->node->attrs(), "frame_name", &frameName); + frameName = strings::StrCat(parent_frame->frame_name, ";0;", frameName); + + wrong_ret = !(frameName == frame_name); + +// printf("parent_frame_appended: %s\n", frameName.c_str()); +// printf("frame name: %s\n", frame_name.c_str()); + + printf("%s: Is it the right return? %d \n", dst_item->node->name().c_str(), (frameName == frame_name)); + printf("%s: Is return's input dead? %d \n", dst_item->node->name().c_str(), is_dead); + printf("%s: Has return input's value? %d \n", dst_item->node->name().c_str(),(*outputs)[src_slot].has_value); + } + + else { + printf("%s: Is input dead? %d \n",dst_item->node->name().c_str(), is_dead); + printf("%s: Has input value? 
%d \n", dst_item->node->name().c_str(),(*outputs)[src_slot].has_value); + } + const bool increment_dead = - (is_dead || (!is_control_edge && !(*outputs)[src_slot].has_value)); + (is_dead || (!is_control_edge && !(*outputs)[src_slot].has_value) || wrong_ret); + + int pending, dead; - iter_state->adjust_for_activation(dst_pending_id, increment_dead, - &pending, &dead); + iter_state->adjust_for_activation(dst_pending_id, increment_dead, &pending, &dead); dst_dead = (dead > 0); dst_ready = (pending == 0); } diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index f4ce2ae6e7..d2c4ae455c 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -58,8 +58,6 @@ void GraphOptimizer::Optimize( changed = true; } -// RemoveIdentityNodes(g); -// changed = true; // if (opts_.do_constant_folding()) { // ConstantFoldingOptions cf_opts; // cf_opts.shape_map = shape_map; diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index e3a72f6acf..9f5906fd39 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -651,7 +651,7 @@ inline bool IsEnter(const Node* node) { return node->IsEnter(); } inline bool IsExit(const Node* node) { return node->IsExit(); } inline bool IsNextIteration(const Node* n) { return n->IsNextIteration(); } inline bool IsCall(const Node* node) { return node->IsCall(); } -inline bool IsReturn(const Node* node) { return node->IsCall(); } +inline bool IsReturn(const Node* node) { return node->IsReturn(); } inline bool IsLoopCond(const Node* node) { return node->IsLoopCond(); } inline bool IsControlTrigger(const Node* n) { return n->IsControlTrigger(); } inline bool IsSend(const Node* node) { return node->IsSend(); } diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index ea2c68aba4..b6dfdbb4f4 100644 --- 
a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -125,7 +125,7 @@ namespace tensorflow { res = input + "/Ret0"; } - std::cout << res << std::endl; +// std::cout << res << std::endl; return res; } @@ -165,7 +165,7 @@ namespace tensorflow { // Create and add in graph a Call node for every input arg NodeDef *call = optimized_graph->add_node(); - call->set_name(strings::StrCat(func_node.name(), "/", "Call_", arg.name())); + call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); call->set_op("Call"); call->set_device(func_node.device()); call->add_input(func_node.input(i)); @@ -173,9 +173,6 @@ namespace tensorflow { (*call->mutable_attr())["T"].set_type(type); (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); (*call->mutable_attr())["is_constant"].set_b(false); -// (*call->mutable_attr())["parallel_calls"].set_i(10); - - NodeDef* merge = argmerge_map[arg.name()]; merge->add_input(call->name()); @@ -228,7 +225,7 @@ namespace tensorflow { // Create and add in graph a Call node for every input arg NodeDef* call = optimized_graph->add_node(); - call->set_name(strings::StrCat(func_node.name(), "/", "Call_", arg.name())); + call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); call->set_op("Call"); call->set_device(func_node.device()); call->add_input(func_node.input(i)); @@ -236,11 +233,10 @@ namespace tensorflow { (*call->mutable_attr())["T"].set_type(type); (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); (*call->mutable_attr())["is_constant"].set_b(false); -// (*call->mutable_attr())["parallel_calls"].set_i(10); // Create and add a temporary merge node (IdentityN) for every input arg NodeDef* merge = optimized_graph->add_node(); - merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", arg.name())); + merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", i)); 
merge->set_op("IdentityN"); merge->set_device(func_node.device()); merge->add_input(call->name()); @@ -273,14 +269,14 @@ namespace tensorflow { } input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); } - /* - // If the node has no input, make hook it up to the func_inputs node to - // ensure it runs in the same frame as the other nodes of the function - // body. + + // If the node has no input, make hook it up to the Merge nodes to ensure + // it runs in the same frame as the other nodes of the function body. if (func_body_node.input_size() == 0) { - *func_body_node.add_input() = AsControlDependency(func_inputs->name()); + for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it) { + *func_body_node.add_input() = AsControlDependency(it->second->name()); + } } - */ } // Add the node name as a prefix to avoid collisions after inlining diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 037678e3c0..c12bd554f6 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -66,10 +66,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, if (!cfg_.disable_model_pruning()) { optimizers.push_back(std::unique_ptr(new ModelPruner())); } - if (cfg_.constant_folding() != RewriterConfig::OFF) { - optimizers.push_back( - std::unique_ptr(new ConstantFolding(cpu_device_))); - } +// if (cfg_.constant_folding() != RewriterConfig::OFF) { +// optimizers.push_back( +// std::unique_ptr(new ConstantFolding(cpu_device_))); +// } if (cfg_.function_transformation() != RewriterConfig::OFF) { optimizers.push_back( std::unique_ptr(new FunctionTransformation())); diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 1b38f4c3e5..63f8676e78 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -317,7 +317,6 @@ 
REGISTER_OP("Call") .Attr("T: type") .Attr("frame_name: string") .Attr("is_constant: bool = false") - .Attr("parallel_calls: int = 10") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->UnknownShape()); @@ -340,12 +339,9 @@ Creates (or finds) a child frame, and makes `data` available to the child frame. This op is used together with `Return` to create recursive calls in the graph. The unique `frame_name` is used by the `Executor` to identify frames. -At most `parallel_calls` recursive calls -are run in parallel in the child frame. data: The tensor to be made available to the child frame. frame_name: The name of the child frame. -parallel_calls: The number of recursive calls allowed to run in parallel. output: The same tensor as `data`. Returns tensors with the same shapes and contents as the input @@ -358,19 +354,15 @@ REGISTER_OP("RefCall") .Attr("T: type") .Attr("frame_name: string") .Attr("is_constant: bool = false") - .Attr("parallel_calls: int = 10") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( Creates (or finds) a child frame, and makes `data` available to the child frame. This op is used together with `Return` to create recursive calls in the graph. The unique `frame_name` is used by the `Executor` to identify frames. -At most `parallel_calls` recursive calls -are run in parallel in the child frame. data: The tensor to be made available to the child frame. frame_name: The name of the child frame. -parallel_calls: The number of recursive calls allowed to run in parallel. output: The same tensor as `data`. 
Returns tensors with the same shapes and contents as the input From 2658ed60ea19064dbfd365957ca76a6057259c7f Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Mon, 2 Jul 2018 06:19:02 +0300 Subject: [PATCH 10/64] Fixed Executor for mutually recursive functions --- tensorflow/core/common_runtime/executor.cc | 82 +++++++++++-------- .../common_runtime/graph_execution_state.cc | 2 - tensorflow/core/graph/graph_constructor.cc | 2 +- .../grappler/optimizers/meta_optimizer.cc | 8 +- 4 files changed, 53 insertions(+), 41 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index a39d7c5834..60f15378bf 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1355,6 +1355,9 @@ ExecutorState::~ExecutorState() { Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_info, std::unordered_map>& synonym_frames) { + + std::unordered_map synframeToCall; + const int num_nodes = g->num_node_ids(); cf_info->frame_names.resize(num_nodes); std::vector parent_nodes; @@ -1380,6 +1383,7 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in ready.pop_front(); Node* parent = nullptr; + if (IsEnter(curr_node)) { // Enter a child frame. 
TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); @@ -1390,7 +1394,8 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); int out_id; - for (const Edge* out_edge : curr_node->out_edges()) { // Remove for loop and grab the only actual output of the call node + // Remove for loop and grab the only actual output of the call node + for (const Edge* out_edge : curr_node->out_edges()) { out_id = out_edge->dst()->id(); break; } @@ -1409,9 +1414,9 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in // Recursive call : either from within the same function or from another one else { // It's just a synonym frame - parent = parent_nodes[curr_id]; - synonym_frames[cf_info->frame_names[out_id]].emplace(frame_name); - frame_name = cf_info->frame_names[curr_id]; + if (synonym_frames[cf_info->frame_names[out_id]].emplace(frame_name).second == true) { + synframeToCall.emplace(frame_name, curr_id); + } } } @@ -1425,17 +1430,33 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in else if (IsReturn(curr_node)) { TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); - // If frame_name exists as a key in the map and not as a synonym - if (synonym_frames.find(frame_name) != synonym_frames.end()) { + + // node corresponds to a recursive call + if (synonym_frames.find(frame_name) == synonym_frames.end()) { + + + std::unordered_map::const_iterator it = synframeToCall.find(frame_name); + if (it != synframeToCall.end()) { + // we don't trust parent_nodes[curr_id] and cf_info->frame_names[curr_id] + // values that were set by the predecessor as they might be wrong in + // case of mutually recursive functions + int call_id = it->second; + parent = parent_nodes[call_id]; + frame_name = cf_info->frame_names[call_id]; + } + else { + // node corresponds to a recursive call 
we have not already encountered + // Insert back in queue so it will be processed again after synonym frame is created + ready.push_back(curr_node); + continue; + } + } + else { // Exit to the parent frame. parent = parent_nodes[curr_id]; frame_name = cf_info->frame_names[parent->id()]; parent = parent_nodes[parent->id()]; } - else { - parent = parent_nodes[curr_id]; - frame_name = cf_info->frame_names[curr_id]; - } } else { @@ -1460,7 +1481,6 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in } } } - return Status::OK(); } @@ -1593,7 +1613,9 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { EntryVector outputs; bool completed = false; inline_ready.push_back(tagged_node); + while (!inline_ready.empty()) { + tagged_node = inline_ready.front(); inline_ready.pop_front(); const Node* node = tagged_node.node; @@ -2007,15 +2029,20 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, else if (item->is_call) { - FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame); - output_iter = 0; - { - const NodeItem* item = impl_->gview_.node(node->id()); - mutex_lock l(output_frame->mu); - output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); - output_frame->num_pending_inputs--; + if (is_dead) { + // Stop the deadness propagation. 
+ output_frame = nullptr; + } + else { + FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame); + output_iter = 0; + { + const NodeItem *item = impl_->gview_.node(node->id()); + mutex_lock l(output_frame->mu); + output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); + output_frame->num_pending_inputs--; + } } - is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); } @@ -2025,7 +2052,6 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, output_frame = nullptr; } else { - output_frame = input_frame->parent_frame; output_iter = input_frame->parent_iter; { @@ -2093,6 +2119,7 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, // At this point, this node is completely done. We also know if the // completion of this node makes its frame completed. if (is_frame_done) { + FrameState* parent_frame = input_frame->parent_frame; const int64 parent_iter = input_frame->parent_iter; DeleteFrame(input_frame, ready); @@ -2369,7 +2396,6 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, int parallel_iters = 1; if (node->op_def().name() != "Call") { - printf("Yep its a call\n"); s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); DCHECK(s.ok()) << s; } @@ -2559,19 +2585,7 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, GetNodeAttr(dst_item->node->attrs(), "frame_name", &frameName); frameName = strings::StrCat(parent_frame->frame_name, ";0;", frameName); - wrong_ret = !(frameName == frame_name); - -// printf("parent_frame_appended: %s\n", frameName.c_str()); -// printf("frame name: %s\n", frame_name.c_str()); - - printf("%s: Is it the right return? %d \n", dst_item->node->name().c_str(), (frameName == frame_name)); - printf("%s: Is return's input dead? %d \n", dst_item->node->name().c_str(), is_dead); - printf("%s: Has return input's value? 
%d \n", dst_item->node->name().c_str(),(*outputs)[src_slot].has_value); - } - - else { - printf("%s: Is input dead? %d \n",dst_item->node->name().c_str(), is_dead); - printf("%s: Has input value? %d \n", dst_item->node->name().c_str(),(*outputs)[src_slot].has_value); + wrong_ret = (frameName != frame_name); } const bool increment_dead = diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index d6660161da..ee8b1d919e 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -380,8 +380,6 @@ Status GraphExecutionState::OptimizeGraph( const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); - - printf(" Test\n"); /*******************************************************************************************/ diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index b5efa49926..f3334c93e2 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -837,7 +837,7 @@ Status GraphConstructor::Convert() { if (has_data_back_edge && !IsMerge(*node_def) && !IsReturn(*node_def)) { return errors::InvalidArgument( "Node '", node_def->name(), - "' had a back edge, but only Merge nodes can have back edges."); + "' had a back edge, but only Merge and Return nodes can have back edges."); } Node* node; diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index c12bd554f6..037678e3c0 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -66,10 +66,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, if (!cfg_.disable_model_pruning()) { optimizers.push_back(std::unique_ptr(new ModelPruner())); } -// if (cfg_.constant_folding() != 
RewriterConfig::OFF) { -// optimizers.push_back( -// std::unique_ptr(new ConstantFolding(cpu_device_))); -// } + if (cfg_.constant_folding() != RewriterConfig::OFF) { + optimizers.push_back( + std::unique_ptr(new ConstantFolding(cpu_device_))); + } if (cfg_.function_transformation() != RewriterConfig::OFF) { optimizers.push_back( std::unique_ptr(new FunctionTransformation())); From 87738e15eca1b6b1a601b19c2b8dc65341a0c0e1 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Mon, 2 Jul 2018 06:23:46 +0300 Subject: [PATCH 11/64] Fixed bug in transformation - if a fetch_output corresponded to a function-calling node that transformation altered, then fetch was invalid --- .../optimizers/function_transformation.cc | 219 +++++------------- 1 file changed, 59 insertions(+), 160 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index b6dfdbb4f4..260885612f 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/core/util/event.pb.h" #include "tensorflow/core/util/events_writer.h" - +#include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph_def_util.h" @@ -110,6 +110,26 @@ namespace tensorflow { return Status::OK(); } + // Copy input/output argument type to the type_list. Return error if argument + // type is not explicitly defined, and not specified in function attributes. 
+ Status CopyArgTypeN(const NodeDef& func_node, + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + AttrValue::ListValue* type_list) { + if (arg.type() != DT_INVALID) { + type_list->add_type(arg.type()); + } else { + auto it = func_attr.find(arg.type_attr()); + if (it == func_attr.end() || it->second.type() == DT_INVALID) { + return errors::InvalidArgument( + "Invalid ", arg_kind, " argument ", arg.name(), " for function ", + func_node.op(), " instantiated by ", func_node.name()); + } + type_list->add_type(it->second.type()); + } + return Status::OK(); + } + string ParseString(string input) { size_t pos = 0; @@ -210,9 +230,6 @@ namespace tensorflow { std::set foutputs; GatherOutputs(foutputs, *item, ctx); -//std::cout << foutputs.size() << '\n'; -//for( const auto& str : foutputs ) std::cout << str << '\n'; - DataType type; std::unordered_map input_nodes; functions_in[func_node.op()].fetch = item->fetch; @@ -333,7 +350,7 @@ namespace tensorflow { (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); } - // Break IdentityN Merges into multiple common Merge ops + // Break IdentityN Merges into multiple common Binary Merge ops int j=0; for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { @@ -385,151 +402,7 @@ namespace tensorflow { return Status::OK(); } -/* - class FakeCPUDevice : public Device { - public: - FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {} - Status Sync() override { return Status::OK(); } - }; - - class SymbolicGradientEnv { - public: - SymbolicGradientEnv(int graph_version, const FunctionDefLibrary& library) - : graph_version_(graph_version), library_(library) {} - - FunctionLibraryDefinition* function_library() { - InitializeIfNeeded(); - return fld_.get(); - } - FunctionLibraryRuntime* function_library_runtime() { - InitializeIfNeeded(); - return flr_; - } - - private: - // This initialization is expensive. 
Do it lazily to avoid paying for it - // unless it's needed. - void InitializeIfNeeded() { - if (flr_) { - return; - } - Env* env = Env::Default(); - DeviceAttributes attr; - attr.set_name("/device:CPU:0"); - attr.set_device_type("CPU"); - FakeCPUDevice* dev = new FakeCPUDevice(env, attr); - std::vector devices; - devices.push_back(dev); - dvc_mgr_.reset(new DeviceMgr(devices)); - fld_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), library_)); - OptimizerOptions optimizer_opts; - optimizer_opts.set_do_function_inlining(true); - pflr_.reset(new ProcessFunctionLibraryRuntime( - dvc_mgr_.get(), env, graph_version_, fld_.get(), optimizer_opts)); - flr_ = pflr_->GetFLR(dev->name()); - } - - const int graph_version_; - const FunctionDefLibrary& library_; - std::unique_ptr dvc_mgr_; - std::unique_ptr fld_; - std::unique_ptr pflr_; - FunctionLibraryRuntime* flr_ = nullptr; - }; - - Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env, - GraphDef* inlined_graph) - { - GraphDef graph_def; - - // Create a node to anchor the gradient inputs - NodeDef* inlined_input = graph_def.add_node(); - inlined_input->set_name("FunctionInputs"); - inlined_input->set_op("IdentityN"); - AttrValue::ListValue* type_list = - (*inlined_input->mutable_attr())["T"].mutable_list(); - for (const auto& type : node.attr().at("Tin").list().type()) { - type_list->add_type(static_cast(type)); - } - // Add the gradient node - NodeDef* inlined = graph_def.add_node(); - *inlined = node; - inlined->clear_input(); - for (int i = 0; i < node.attr().at("Tin").list().type_size(); ++i) { - inlined->add_input(strings::StrCat(inlined_input->name(), ":", i)); - } - - // Create a node to anchor the gradient outputs - NodeDef* inlined_output = graph_def.add_node(); - inlined_output->set_name("FunctionOutputs"); - inlined_output->set_op("IdentityN"); - type_list = (*inlined_output->mutable_attr())["T"].mutable_list(); - for (const auto& type : node.attr().at("Tout").list().type()) { - 
type_list->add_type(static_cast(type)); - } - for (int i = 0; i < node.attr().at("Tout").list().type_size(); ++i) { - inlined_output->add_input(strings::StrCat(inlined->name(), ":", i)); - } - - // Convert the graphdef to a graph - GraphConstructorOptions graph_ctor_opts; - graph_ctor_opts.allow_internal_ops = true; - graph_ctor_opts.expect_device_spec = false; - Graph graph(env->function_library()); - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph)); - - // Recursively inline the functions until there is nothing more to inline. We - // should at least expand one function. - int counter = 0; - while (counter < 50 && - ExpandInlineFunctions(env->function_library_runtime(), &graph)) { - ++counter; - } - - GraphDef inlined_graph_def; - graph.ToGraphDef(&inlined_graph_def); - - // Add the default values of attributes to the nodes that have been inlined. - TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def, *graph.op_registry(), 0, true)); - - // Add the inlined nodes to the graph - for (NodeDef& inlined_node : *inlined_graph_def.mutable_node()) { - if (inlined_node.name() == "FunctionOutputs") { - inlined_node.set_name(node.name()); - for (int i = 0; i < inlined_node.input_size(); ++i) { - inlined_node.set_input( - i, AddPrefixToNodeName(inlined_node.input(i), node.name())); - } - } else if (inlined_node.name() == "FunctionInputs") { - inlined_node.set_name( - AddPrefixToNodeName(inlined_node.name(), node.name())); - inlined_node.clear_input(); - for (int i = 0; i < node.input_size(); ++i) { - inlined_node.add_input(node.input(i)); - } - } else { - inlined_node.set_name( - AddPrefixToNodeName(inlined_node.name(), node.name())); - for (int i = 0; i < inlined_node.input_size(); ++i) { - inlined_node.set_input( - i, AddPrefixToNodeName(inlined_node.input(i), node.name())); - } - // If the node has no input, hook it up to the function input node to make - // sure it runs in the same frame as the other nodes of the function 
body. - if (inlined_node.input_size() == 0) { - *inlined_node.add_input() = AsControlDependency( - AddPrefixToNodeName("FunctionInputs", node.name())); - } - } - inlined_node.set_device(node.device()); - inlined_graph->add_node()->Swap(&inlined_node); - } - - return Status::OK(); - } -*/ } // namespace @@ -550,19 +423,11 @@ namespace tensorflow { return Status::OK(); } -// SymbolicGradientEnv env(item.graph.versions().producer(),item.graph.library()); - std::unordered_map functions_in; // Copying node cause I need to make changes on it for (NodeDef node : item.graph.node()) { -// if (node.op() == "SymbolicGradient") { -// TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph)); -// continue; -// } - for (string& input : *node.mutable_input()) { - // If it takes input from a function if (foutputs.find(input) != foutputs.end()) { input = ParseString(input); @@ -575,6 +440,39 @@ namespace tensorflow { functions_in.emplace(node.op(), func_info); InlineFunction(node, *func, function_inlining_ctx, optimized_graph, functions_in); functions_in.erase(node.op()); // At this point functions_in will be empty + + // Check if the function node corresponded to some fetch_outputs + // before transformation occurred + NodeDef *idN; + bool created = false; + const std::unordered_map func_attr(node.attr().begin(), node.attr().end()); + + for (size_t i = 0; i < item.fetch.size(); ++i) { + const string &t = item.fetch[i]; + // Parse t into node_name and output_index. 
+ TensorId id(ParseTensorName(t)); + + if (node.name() == id.first) { + + if (created == false) { + idN = optimized_graph->add_node(); + idN->set_op("IdentityN"); + idN->set_name(node.name()); + idN->set_device(node.device()); + + AttrValue::ListValue* type_list = (*idN->mutable_attr())["T"].mutable_list(); + for (const OpDef::ArgDef& arg : func->signature().output_arg()) { + TF_RETURN_IF_ERROR(CopyArgTypeN(node, func_attr, "input", arg, type_list)); + } + + idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); + + created = true; + } else { + idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); + } + } + } } else { *optimized_graph->add_node() = node; @@ -585,13 +483,14 @@ namespace tensorflow { *optimized_graph->mutable_library() = item.graph.library(); + /******************************************************************************************************/ // Dumps optimized graph in a not so readable form - const GraphDef* tmp = optimized_graph; - printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); +// const GraphDef* tmp = optimized_graph; +// printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); // Write an event, so that we can visualize this optimized graph in tensorboard - EventsWriter writer("INLINE"); + EventsWriter writer("TRANSFORMATION"); Event event; event.set_wall_time(1234); event.set_step(34); From 37ffe2619d94577dcacac170732f132ff9974fa5 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 14:54:44 +0300 Subject: [PATCH 12/64] Remove unnecessary changes to files Remove unnecessary edits to files to make clearer the changes that matter. 
--- .../common_runtime/graph_execution_state.cc | 6 +-- tensorflow/core/graph/graph_constructor.cc | 54 ++++++++----------- .../grappler/optimizers/meta_optimizer.cc | 7 ++- tensorflow/core/kernels/control_flow_ops.cc | 11 ---- tensorflow/core/ops/control_flow_ops.cc | 37 ++++++------- 5 files changed, 44 insertions(+), 71 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index ee8b1d919e..228cd66208 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -362,7 +362,9 @@ Status GraphExecutionState::OptimizeGraph( GraphConstructorOptions opts; opts.allow_internal_ops = true; optimized_graph->reset(new Graph(OpRegistry::Global())); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); + /*******************************************************************************************/ // Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("Fully_Optimized"); @@ -381,8 +383,6 @@ Status GraphExecutionState::OptimizeGraph( event.set_graph_def(bf, proto_size); writer.WriteEvent(event); /*******************************************************************************************/ - - // The graph conversion sets the requested device names but not the assigned // device names. However, since at this point the graph is placed TF expects // an assigned device name for every node. 
Therefore we copy the requested diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index f3334c93e2..5e9bd16d01 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -181,12 +181,15 @@ class GraphConstructor { // input_already_exists is a pre-initialized vector of length // node_def->input_size(). This function will mark inputs that are remapped to // true. - void RemapNodeDefInputs(NodeDef* node_def, std::vector* input_already_exists); + void RemapNodeDefInputs(NodeDef* node_def, + std::vector* input_already_exists); // input_already_exists is a pre-initialized vector of length // node_def->input_size(). This function will add and mark control inputs as // true. - void AddControlDependencies(NodeDef* node_def, std::vector* input_already_exists); - void AddPrefixToNodeDef(const std::vector& input_already_exists, NodeDef* node_def); + void AddControlDependencies(NodeDef* node_def, + std::vector* input_already_exists); + void AddPrefixToNodeDef(const std::vector& input_already_exists, + NodeDef* node_def); // From constructor const Options opts_; @@ -253,7 +256,6 @@ class GraphConstructor { std::vector back_edges_; }; - // This could be expensive but we don't expect to call it often, if at all (only // if there are multiple nodes in g_ with the same name) bool NodeNameInValues(const std::map& input_map, @@ -413,24 +415,21 @@ Status GraphConstructor::InitFromEdges() { const NodeDef& node_def = *node_defs_[n]; if (IsMerge(node_def)) { - // Cycles in the graph are only allowed for while loops and recursion. - // A while loop is identified by an edge from a NextIteration node to a Merge node. + // Cycles in the graph are only allowed for while loops and recursion. A while loop is + // identified by an edge from a NextIteration node to a Merge node. 
// A recursion is identified by an edge from a NextCall Node to a Merge node // For such Merge nodes, only wait for one non-control input before // considering the node ready to process in Convert(). int32 num_control_edges = 0; bool has_loop_back_edge = false; - for (int i = 0; i < node_def.input_size(); ++i) { - StringPiece input_name(node_def.input(i)); - if (input_name.starts_with("^")) { num_control_edges++; - } - else { + } else { TensorId id(ParseTensorName(input_name)); - if (next_iteration_call_nodes_.find(id.first.ToString()) != next_iteration_call_nodes_.end()) { + if (next_iteration_call_nodes_.find(id.first.ToString()) != + next_iteration_call_nodes_.end()) { has_loop_back_edge = true; } } @@ -440,10 +439,8 @@ Status GraphConstructor::InitFromEdges() { } else { pending_count_.push_back(node_def.input_size()); } - } - - // Does not necessarily mean cycle though - maybe I should find a better condition - else if (IsReturn(node_def)) { + } else if (IsReturn(node_def)) { + // Does not necessarily mean cycle though - maybe I should find a better condition int32 num_control_edges = 0; for (int i = 0; i < node_def.input_size(); ++i) { @@ -457,18 +454,13 @@ Status GraphConstructor::InitFromEdges() { pending_count_.push_back(num_control_edges); ready_.push_back(n); - } - - else { + } else { pending_count_.push_back(node_def.input_size()); } - - if (node_def.input_size() == 0) { ready_.push_back(n); continue; } - for (int i = 0; i < node_def.input_size(); ++i) { StringPiece input_name = node_def.input(i); TensorId id(ParseTensorName(input_name)); @@ -505,7 +497,7 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) { // Add the node to the graph. 
Status status; *node = g_->AddNode(node_def, &status); - if (!status.ok()) {return status;} + if (!status.ok()) return status; if (opts_.expect_device_spec) { (*node)->set_assigned_device_name(node_def.device()); } @@ -787,6 +779,10 @@ Status GraphConstructor::Convert() { } } + // TODO(ashankar): The line below means an additional copy of the NodeDef, + // which can be expensive if the NodeDef contains large tensors in it. + // Might make sense to change the API for ImportGraphDef to take a mutable + // GraphDef* and avoid the copying. imported_node_def = original_node_def; if (!opts_.input_map.empty()) { // Note that input_already_exists can shrink here @@ -797,8 +793,7 @@ Status GraphConstructor::Convert() { AddControlDependencies(&imported_node_def, &input_already_exists); } node_def = &imported_node_def; - } - else { + } else { node_def = &original_node_def; } @@ -863,6 +858,8 @@ Status GraphConstructor::Convert() { } } + // TODO(skyewm): remove conditional when b/35715995 ("Functions lack shape + // inference") is resolved. 
if (g_->flib_def().Find(node_def->name()) == nullptr) { TF_RETURN_IF_ERROR(ValidateShape(node)); } @@ -875,7 +872,6 @@ Status GraphConstructor::Convert() { return errors::InvalidArgument(node_defs_.size() - processed, " nodes in a cycle"); } - return Status::OK(); } @@ -976,9 +972,6 @@ Status GraphConstructor::MakeEdge(Node* src, int output_index, Node* dst, } // namespace - - - Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts, const GraphDef& gdef, Graph* g) { ShapeRefiner refiner(gdef.versions().producer(), g->op_registry()); @@ -986,9 +979,6 @@ Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts, &gdef.library(), g, &refiner, nullptr); } - - - Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts, gtl::ArraySlice nodes, Graph* g) { ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, g->op_registry()); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 037678e3c0..f3318c0584 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -100,7 +100,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } } else { std::set available_optimizers = {"pruning", "constfold", - "function_transformation", + "function_transformation", "layout", "memory", "autoparallel", "arithmetic"}; for (const auto& optimizer : cfg_.optimizers()) { @@ -145,9 +145,8 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item, } bool MetaOptimizerEnabled(const RewriterConfig& cfg) { - return !cfg.disable_model_pruning() || - cfg.function_transformation() != RewriterConfig::OFF || - cfg.optimize_tensor_layout() || + return !cfg.disable_model_pruning() || cfg.optimize_tensor_layout() || + cfg.function_transformation() != RewriterConfig::OFF || cfg.constant_folding() != RewriterConfig::OFF || cfg.arithmetic_optimization() != RewriterConfig::OFF || cfg.auto_parallel().enable() 
|| cfg.memory_optimization() > 1 || diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index d30eddb5c8..04c84768ee 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -587,10 +587,6 @@ REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL #endif // TENSORFLOW_USE_SYCL - - - - /*************************************************************************************************/ void CallOp::Compute(OpKernelContext* context) { @@ -760,15 +756,8 @@ REGISTER_SYCL_HOST_KERNEL(string); REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL - /*************************************************************************************************/ - - - - - - // A LoopCond op has one input and one output. The input is a boolean // scalar representing the taken branches of the "pivot" Switch that // determines loop termination. As a contract, any high-level front-end diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 63f8676e78..9daf77206c 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -403,26 +403,26 @@ REGISTER_OP("LoopCond") .Input("input: bool") .Output("output: bool") .SetShapeFn([](InferenceContext* c) { - return shape_inference::UnchangedShapeWithRank(c, 0); + return shape_inference::UnchangedShapeWithRank(c, 0); }) .Doc(R"doc( - Forwards the input to the output. +Forwards the input to the output. - This operator represents the loop termination condition used by the - "pivot" switches of a loop. +This operator represents the loop termination condition used by the +"pivot" switches of a loop. - input: A boolean scalar, representing the branch predicate of the Switch op. - output: The same tensor as `input`. - )doc"); +input: A boolean scalar, representing the branch predicate of the Switch op. +output: The same tensor as `input`. 
+)doc"); // -------------------------------------------------------------------------- REGISTER_OP("ControlTrigger") .SetShapeFn(shape_inference::NoOutputs) .Doc(R"docstring( - Does nothing. Serves as a control trigger for scheduling. +Does nothing. Serves as a control trigger for scheduling. - Only useful as a placeholder for control edges. - )docstring"); +Only useful as a placeholder for control edges. +)docstring"); // -------------------------------------------------------------------------- REGISTER_OP("Abort") @@ -430,20 +430,15 @@ REGISTER_OP("Abort") .Attr("exit_without_error: bool = false") .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( - Raise a exception to abort the process when called. - - If exit_without_error is true, the process will exit normally, - otherwise it will exit with a SIGABORT signal. - - Returns nothing but an exception. - - error_msg: A string which is the message associated with the exception. - )doc"); - - +Raise a exception to abort the process when called. +If exit_without_error is true, the process will exit normally, +otherwise it will exit with a SIGABORT signal. +Returns nothing but an exception. +error_msg: A string which is the message associated with the exception. 
+)doc"); From d113ee52c45c966a17abefd980f18c86976b8141 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 15:11:37 +0300 Subject: [PATCH 13/64] Remove (more) unnecessary edits --- tensorflow/core/graph/graph_constructor.cc | 19 +++++-------------- .../grappler/optimizers/meta_optimizer.cc | 4 ++-- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 5e9bd16d01..c56c13e78e 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -60,8 +60,7 @@ inline bool IsCall(const NodeDef& node_def) { inline bool IsReturn(const NodeDef& node_def) { return node_def.op() == "Return" || node_def.op() == "RefReturn"; - } - +} bool IsValidNodeName(StringPiece s, bool allow_internal_ops) { using ::tensorflow::strings::Scanner; @@ -212,7 +211,6 @@ class GraphConstructor { int gdef_index; Node* node; // nullptr until the NodeDef is converted to a Node. }; - // TODO(vrv): Profile this data structure to see if we should use an // alternative implementation of std::unordered_map. std::unordered_map gdef_nodes_; @@ -402,21 +400,19 @@ std::unordered_set GetNextIterationCallNodes( return next_iteration_call_nodes; } - - Status GraphConstructor::InitFromEdges() { const int num_nodes = node_defs_.size(); pending_count_.reserve(num_nodes); outputs_.resize(num_nodes); - std::unordered_set next_iteration_call_nodes_ = GetNextIterationCallNodes(node_defs_); + std::unordered_set next_iteration_call_nodes_ = + GetNextIterationCallNodes(node_defs_); // Parse the inputs for each node. for (int n = 0; n < num_nodes; ++n) { const NodeDef& node_def = *node_defs_[n]; - if (IsMerge(node_def)) { - // Cycles in the graph are only allowed for while loops and recursion. A while loop is - // identified by an edge from a NextIteration node to a Merge node. + // Cycles in the graph are only allowed for while loops and recursion. 
+ // A while loop is identified by an edge from a NextIteration node to a Merge node. // A recursion is identified by an edge from a NextCall Node to a Merge node // For such Merge nodes, only wait for one non-control input before // considering the node ready to process in Convert(). @@ -442,16 +438,12 @@ Status GraphConstructor::InitFromEdges() { } else if (IsReturn(node_def)) { // Does not necessarily mean cycle though - maybe I should find a better condition int32 num_control_edges = 0; - for (int i = 0; i < node_def.input_size(); ++i) { - StringPiece input_name(node_def.input(i)); - if (input_name.starts_with("^")) { num_control_edges++; } } - pending_count_.push_back(num_control_edges); ready_.push_back(n); } else { @@ -749,7 +741,6 @@ Status GraphConstructor::Convert() { // inputs, pending_counts_ with the number of inputs for each node and // outputs_ with the outputs of each node). while (!ready_.empty()) { - int o = ready_.back(); ready_.pop_back(); ++processed; diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index f3318c0584..acf5e258a7 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -38,7 +38,7 @@ std::unique_ptr MetaOptimizer::NewOptimizer( graph_optimizer.reset(new ModelPruner()); } if (optimizer == "function_transformation") { - graph_optimizer.reset(new FunctionTransformation()); + graph_optimizer.reset(new FunctionTransformation()); } if (optimizer == "constfold") { graph_optimizer.reset(new ConstantFolding(cpu_device_)); @@ -72,7 +72,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } if (cfg_.function_transformation() != RewriterConfig::OFF) { optimizers.push_back( - std::unique_ptr(new FunctionTransformation())); + std::unique_ptr(new FunctionTransformation())); } if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { optimizers.push_back( From 
5cdeeaf3f31098c1ea578cce99c252406018b2c5 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 15:31:45 +0300 Subject: [PATCH 14/64] Reformat and reident --- .../common_runtime/graph_execution_state.cc | 2 - tensorflow/core/kernels/control_flow_ops.cc | 69 +++++++++---------- tensorflow/core/kernels/control_flow_ops.h | 3 - tensorflow/core/ops/control_flow_ops.cc | 7 +- 4 files changed, 35 insertions(+), 46 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 228cd66208..e3fadc6a63 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -358,13 +358,11 @@ Status GraphExecutionState::OptimizeGraph( GraphDef new_graph; TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer( item, rewrite_options, cpu_device, &cluster, &new_graph)); - GraphConstructorOptions opts; opts.allow_internal_ops = true; optimized_graph->reset(new Graph(OpRegistry::Global())); TF_RETURN_IF_ERROR( ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); - /*******************************************************************************************/ // Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("Fully_Optimized"); diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 04c84768ee..32fc1f3659 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -588,17 +588,16 @@ REGISTER_SYCL_HOST_KERNEL(string); #endif // TENSORFLOW_USE_SYCL /*************************************************************************************************/ +void CallOp::Compute(OpKernelContext* context) { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } +} - void 
CallOp::Compute(OpKernelContext* context) { - if (IsRefType(context->input_dtype(0))) { - context->forward_ref_input_to_ref_output(0, 0); - } else { - context->set_output(0, context->input(0)); - } - } - - REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOp); - REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOp); +REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOp); +REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOp); #define REGISTER_GPU_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ @@ -607,16 +606,16 @@ REGISTER_SYCL_HOST_KERNEL(string); REGISTER_KERNEL_BUILDER( \ Name("RefCall").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) - TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); - TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); - REGISTER_GPU_KERNEL(bool); - REGISTER_GPU_REF_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); +REGISTER_GPU_KERNEL(bool); +REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_KERNEL #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL - #define REGISTER_SYCL_KERNEL(type) \ +#define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ Name("Call").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) REGISTER_SYCL_KERNEL(bool); @@ -672,25 +671,25 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); .TypeConstraint("T"), \ CallOp) - REGISTER_GPU_HOST_KERNEL(int32); - REGISTER_GPU_HOST_REF_KERNEL(int32); - REGISTER_GPU_HOST_KERNEL(string); - REGISTER_GPU_HOST_REF_KERNEL(string); - REGISTER_GPU_HOST_KERNEL(ResourceHandle); +REGISTER_GPU_HOST_KERNEL(int32); +REGISTER_GPU_HOST_REF_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(string); +REGISTER_GPU_HOST_REF_KERNEL(string); +REGISTER_GPU_HOST_KERNEL(ResourceHandle); #undef REGISTER_GPU_HOST_KERNEL #undef REGISTER_GPU_HOST_REF_KERNEL - void ReturnOp::Compute(OpKernelContext* context) { - if (IsRefType(context->input_dtype(0))) { - 
context->forward_ref_input_to_ref_output(0, 0); - } else { - context->set_output(0, context->input(0)); - } - } +void ReturnOp::Compute(OpKernelContext* context) { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } +} - REGISTER_KERNEL_BUILDER(Name("Return").Device(DEVICE_CPU), ReturnOp); - REGISTER_KERNEL_BUILDER(Name("RefReturn").Device(DEVICE_CPU), ReturnOp); +REGISTER_KERNEL_BUILDER(Name("Return").Device(DEVICE_CPU), ReturnOp); +REGISTER_KERNEL_BUILDER(Name("RefReturn").Device(DEVICE_CPU), ReturnOp); #define REGISTER_GPU_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ @@ -699,10 +698,10 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); REGISTER_KERNEL_BUILDER( \ Name("RefReturn").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); - TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); - TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); - REGISTER_GPU_KERNEL(bool); - REGISTER_GPU_REF_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); +REGISTER_GPU_KERNEL(bool); +REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_KERNEL #undef REGISTER_GPU_REF_KERNEL @@ -752,8 +751,8 @@ REGISTER_SYCL_HOST_KERNEL(string); .TypeConstraint("T"), \ ReturnOp) - REGISTER_GPU_HOST_KERNEL(int32); - REGISTER_GPU_HOST_KERNEL(string); +REGISTER_GPU_HOST_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL /*************************************************************************************************/ diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h index 42ec64fb0f..e8b6b826d9 100644 --- a/tensorflow/core/kernels/control_flow_ops.h +++ b/tensorflow/core/kernels/control_flow_ops.h @@ -97,7 +97,6 @@ class NextIterationOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp); }; 
-/**************************************************************************/ // A call op has one input and one output. It creates or finds // the child frame that is uniquely identified by the frame_name, // and makes its input available to the child frame. @@ -123,8 +122,6 @@ class ReturnOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ReturnOp); }; - - } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_ diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 9daf77206c..145017b43a 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -310,7 +310,6 @@ output: The same tensor as `data`. )doc"); // -------------------------------------------------------------------------- - REGISTER_OP("Call") .Input("data: T") .Output("output: T") @@ -370,7 +369,6 @@ tensors. )Doc"); // -------------------------------------------------------------------------- - REGISTER_OP("Return") .Input("data: T") .Output("output: T") @@ -403,7 +401,7 @@ REGISTER_OP("LoopCond") .Input("input: bool") .Output("output: bool") .SetShapeFn([](InferenceContext* c) { - return shape_inference::UnchangedShapeWithRank(c, 0); + return shape_inference::UnchangedShapeWithRank(c, 0); }) .Doc(R"doc( Forwards the input to the output. @@ -440,7 +438,4 @@ Returns nothing but an exception. error_msg: A string which is the message associated with the exception. 
)doc"); - - - } // namespace tensorflow From 921a3b5356028ff4cb93ab95dcb319582aa6d0cf Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 15:34:03 +0300 Subject: [PATCH 15/64] Remove newline --- tensorflow/core/graph/graph_constructor.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index c56c13e78e..475ba9acbf 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -387,7 +387,6 @@ Status GraphConstructor::BuildNodeIndex() { std::unordered_set GetNextIterationCallNodes( const GraphConstructor::NodeDefSlice& node_defs) { - std::unordered_set next_iteration_call_nodes; for (int n = 0; n < node_defs.size(); ++n) { From 605033a2e1f13b483e55b08d9bd38f9a39012eb6 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 15:52:04 +0300 Subject: [PATCH 16/64] Reformat executor code --- tensorflow/core/common_runtime/executor.cc | 159 +++++++++------------ 1 file changed, 65 insertions(+), 94 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 60f15378bf..3d63c76f98 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -400,7 +400,8 @@ class ExecutorImpl : public Executor { } }; - static Status BuildControlFlowInfo(const Graph* graph, ControlFlowInfo* cf_info, + static Status BuildControlFlowInfo(const Graph* graph, + ControlFlowInfo* cf_info, std::unordered_map>& synonym_frames); void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info); @@ -659,8 +660,10 @@ Status ExecutorImpl::Initialize() { item->is_return = IsReturn(n); item->is_control_trigger = IsControlTrigger(n); item->is_sink = IsSink(n); - item->is_enter_exit_or_next_iter = (IsEnter(n) || IsExit(n) || IsNextIteration(n)); - item->is_call_or_return = (IsCall(n) || IsReturn(n)); + 
item->is_enter_exit_or_next_iter = + (IsEnter(n) || IsExit(n) || IsNextIteration(n)); + item->is_call_or_return = + (IsCall(n) || IsReturn(n)); // Compute the maximum values we'll store for this node in the // pending counts data structure, and allocate a handle in @@ -668,7 +671,8 @@ Status ExecutorImpl::Initialize() { // space to store these maximal count values. size_t max_pending, max_dead; GetMaxPendingCounts(n, &max_pending, &max_dead); - item->pending_id = frame_info->pending_counts_layout.CreateHandle(max_pending, max_dead); + item->pending_id = + frame_info->pending_counts_layout.CreateHandle(max_pending, max_dead); // Initialize static information about the frames in the graph. frame_info->nodes->push_back(n); @@ -1353,9 +1357,9 @@ ExecutorState::~ExecutorState() { delete slice_reader_cache_; } -Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_info, +Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, + ControlFlowInfo* cf_info, std::unordered_map>& synonym_frames) { - std::unordered_map synframeToCall; const int num_nodes = g->num_node_ids(); @@ -1383,23 +1387,25 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in ready.pop_front(); Node* parent = nullptr; - if (IsEnter(curr_node)) { // Enter a child frame. - TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); + TF_RETURN_IF_ERROR( + GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); parent = curr_node; - } - - else if (IsCall(curr_node)) { - TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); - + } else if (IsExit(curr_node)) { + // Exit to the parent frame. 
+ parent = parent_nodes[curr_id]; + frame_name = cf_info->frame_names[parent->id()]; + parent = parent_nodes[parent->id()]; + } else if (IsCall(curr_node)) { + TF_RETURN_IF_ERROR( + GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); int out_id; // Remove for loop and grab the only actual output of the call node for (const Edge* out_edge : curr_node->out_edges()) { out_id = out_edge->dst()->id(); break; } - // Not a recursive call if (!visited[out_id]) { // Enter a child frame. @@ -1410,31 +1416,18 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in synonyms.clear(); synonym_frames.emplace(frame_name, synonyms); // std::move() } - } - // Recursive call : either from within the same function or from another one - else { + } else { + // Recursive call : either from within the same function or from another one // It's just a synonym frame if (synonym_frames[cf_info->frame_names[out_id]].emplace(frame_name).second == true) { synframeToCall.emplace(frame_name, curr_id); } } - } - - else if (IsExit(curr_node)) { - // Exit to the parent frame. 
- parent = parent_nodes[curr_id]; - frame_name = cf_info->frame_names[parent->id()]; - parent = parent_nodes[parent->id()]; - } - - else if (IsReturn(curr_node)) { - - TF_RETURN_IF_ERROR(GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); - + } else if (IsReturn(curr_node)) { + TF_RETURN_IF_ERROR( + GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); // node corresponds to a recursive call if (synonym_frames.find(frame_name) == synonym_frames.end()) { - - std::unordered_map::const_iterator it = synframeToCall.find(frame_name); if (it != synframeToCall.end()) { // we don't trust parent_nodes[curr_id] and cf_info->frame_names[curr_id] @@ -1443,23 +1436,19 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in int call_id = it->second; parent = parent_nodes[call_id]; frame_name = cf_info->frame_names[call_id]; - } - else { + } else { // node corresponds to a recursive call we have not already encountered // Insert back in queue so it will be processed again after synonym frame is created ready.push_back(curr_node); continue; } - } - else { + } else { // Exit to the parent frame. 
parent = parent_nodes[curr_id]; frame_name = cf_info->frame_names[parent->id()]; parent = parent_nodes[parent->id()]; } - } - - else { + } else { parent = parent_nodes[curr_id]; frame_name = cf_info->frame_names[curr_id]; } @@ -1481,6 +1470,7 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_in } } } + return Status::OK(); } @@ -1613,9 +1603,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { EntryVector outputs; bool completed = false; inline_ready.push_back(tagged_node); - while (!inline_ready.empty()) { - tagged_node = inline_ready.front(); inline_ready.pop_front(); const Node* node = tagged_node.node; @@ -1624,6 +1612,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { const int id = node->id(); const NodeItem& item = *gview.node(id); + // TODO(misard) Replace with a finer-grain enabling flag once we // add better optional debugging support. if (vlog_ && VLOG_IS_ON(1)) { mutex_lock l(input_frame->mu); @@ -2003,11 +1992,9 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, // Normal path for most nodes mutex_lock l(input_frame->mu); output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); - is_frame_done = input_frame->DecrementOutstandingOpsLocked(&impl_->gview_, input_iter, ready); - } - - else if (item->is_enter) { - + is_frame_done = input_frame->DecrementOutstandingOpsLocked( + &impl_->gview_, input_iter, ready); + } else if (item->is_enter) { bool is_constant; const Status s = GetNodeAttr(node->attrs(), "is_constant", &is_constant); DCHECK(s.ok()) << s; @@ -2024,16 +2011,31 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, } output_frame->num_pending_inputs--; } - is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); - } - - else if (item->is_call) { - + is_frame_done = + input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); + } else if (item->is_exit) { + if 
(is_dead) { + mutex_lock l(input_frame->mu); + // Stop and remember this node if it is a dead exit. + if (input_iter == input_frame->iteration_count) { + input_frame->dead_exits.push_back(node); + } + is_frame_done = input_frame->DecrementOutstandingOpsLocked( + &impl_->gview_, input_iter, ready); + } else { + output_frame = input_frame->parent_frame; + output_iter = input_frame->parent_iter; + { + mutex_lock l(output_frame->mu); + output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); + } + is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); + } + } else if (item->is_call) { if (is_dead) { // Stop the deadness propagation. output_frame = nullptr; - } - else { + } else { FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame); output_iter = 0; { @@ -2044,14 +2046,11 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, } } is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); - } - - else if (item->is_return) { + } else if (item->is_return) { if (is_dead) { // Stop the deadness propagation. output_frame = nullptr; - } - else { + } else { output_frame = input_frame->parent_frame; output_iter = input_frame->parent_iter; { @@ -2060,33 +2059,7 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, } } is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); - } - - - else if (item->is_exit) { - if (is_dead) { - mutex_lock l(input_frame->mu); - // Stop and remember this node if it is a dead exit. 
- if (input_iter == input_frame->iteration_count) { - input_frame->dead_exits.push_back(node); - } - is_frame_done = input_frame->DecrementOutstandingOpsLocked( - &impl_->gview_, input_iter, ready); - } - - - else { - output_frame = input_frame->parent_frame; - output_iter = input_frame->parent_iter; - { - mutex_lock l(output_frame->mu); - output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); - } - is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); - } - } - - else { + } else { DCHECK(IsNextIteration(node)); mutex_lock l(input_frame->mu); if (is_dead) { @@ -2119,7 +2092,6 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, // At this point, this node is completely done. We also know if the // completion of this node makes its frame completed. if (is_frame_done) { - FrameState* parent_frame = input_frame->parent_frame; const int64 parent_iter = input_frame->parent_iter; DeleteFrame(input_frame, ready); @@ -2410,7 +2382,8 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, // 'iterations' is a fixed-length circular buffer. temp->iterations.resize(temp->max_parallel_iterations + 1); // Initialize iteration 0. - temp->iterations[0] = new IterationState(temp->pending_counts, temp->total_input_tensors); + temp->iterations[0] = + new IterationState(temp->pending_counts, temp->total_input_tensors); { mutex_lock executor_lock(mu_); @@ -2443,6 +2416,7 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { const auto dst_pending_id = impl_->gview_.node(dst_node->id())->pending_id; + // TODO(yuanbyu): We don't need this if we require the subgraph // given to an executor not to contain a sink node. 
if (dst_node->IsSink()) continue; @@ -2517,7 +2491,6 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, const size_t num_output_edges = item->num_output_edges; const EdgeInfo* edges = item->output_edge_list(); Entry* input_tensors = iter_state->input_tensors; - for (size_t out_index = 0; out_index < num_output_edges; out_index++) { const EdgeInfo& e = edges[out_index]; const int dst_id = e.dst_id; @@ -2525,6 +2498,7 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, const PendingCounts::Handle dst_pending_id = dst_item->pending_id; const int src_slot = e.output_slot; + // TODO(yuanbyu): We don't need this if we require the subgraph // given to an executor not to contain a sink node. if (dst_item->is_sink) continue; @@ -2535,7 +2509,6 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, // analysis happy. const bool is_control_edge = (src_slot == Graph::kControlSlot); bool dst_need_input = !is_control_edge; - if (dst_item->is_merge) { // A merge node is ready if all control inputs have arrived and either // a) a live data input becomes available or b) all data inputs are @@ -2574,7 +2547,6 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, } } } else { - // In case of "Return" dst_node, // we compare node's frame attr with current frame name // if they are different, propagate as dead @@ -2589,11 +2561,10 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, } const bool increment_dead = - (is_dead || (!is_control_edge && !(*outputs)[src_slot].has_value) || wrong_ret); - - + (is_dead || (!is_control_edge && !(*outputs)[src_slot].has_value) || wrong_ret); int pending, dead; - iter_state->adjust_for_activation(dst_pending_id, increment_dead, &pending, &dead); + iter_state->adjust_for_activation(dst_pending_id, increment_dead, + &pending, &dead); dst_dead = (dead > 0); dst_ready = (pending == 0); } From 3830c4bea13a4f46bfc5ec4b459c2bbe75f05902 Mon Sep 17 00:00:00 2001 From: 
Angelos Charalambidis Date: Tue, 3 Jul 2018 15:56:46 +0300 Subject: [PATCH 17/64] Reformat executor code --- tensorflow/core/common_runtime/executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 3d63c76f98..6bae08804f 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2029,7 +2029,8 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, mutex_lock l(output_frame->mu); output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); } - is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); + is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, + input_iter, ready); } } else if (item->is_call) { if (is_dead) { From 1b721a023ed83e2d1ca4cdee5611a062d3dc465b Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:00:35 +0300 Subject: [PATCH 18/64] Minor reformat in executor --- tensorflow/core/common_runtime/executor.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 6bae08804f..f90a19877d 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1361,7 +1361,6 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, ControlFlowInfo* cf_info, std::unordered_map>& synonym_frames) { std::unordered_map synframeToCall; - const int num_nodes = g->num_node_ids(); cf_info->frame_names.resize(num_nodes); std::vector parent_nodes; @@ -2372,7 +2371,6 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); DCHECK(s.ok()) << s; } - FrameState* temp = new FrameState(impl_, parallel_iters); temp->frame_name = child_name; temp->frame_id = Hash64(child_name); From 
8193c33b1ccb3cc8e1fe0fd9f68ffe94136a26cb Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:03:32 +0300 Subject: [PATCH 19/64] Minor in executor --- tensorflow/core/common_runtime/executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index f90a19877d..11476ab89c 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -608,9 +608,8 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending, } Status ExecutorImpl::Initialize() { - gview_.Initialize(graph_); - std::unordered_map> synonym_frames; + gview_.Initialize(graph_); // Build the information about frames in this subgraph. ControlFlowInfo cf_info; From c1a22d83139915d988dfe504429b3255b929bd0b Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:06:29 +0300 Subject: [PATCH 20/64] Set constant folding off instead of commenting out the code --- .../core/common_runtime/graph_optimizer.cc | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index d2c4ae455c..7a303bfaa6 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -28,6 +28,8 @@ GraphOptimizer::GraphOptimizer(const OptimizerOptions& opts) : opts_(opts) { if (opts_.opt_level() >= OptimizerOptions::L1) { opts_.set_do_common_subexpression_elimination(true); opts_.set_do_constant_folding(true); + // set constant folding to false for now; don't know why.. 
+ opts_.set_do_constant_folding(false); } } @@ -58,18 +60,18 @@ void GraphOptimizer::Optimize( changed = true; } -// if (opts_.do_constant_folding()) { -// ConstantFoldingOptions cf_opts; -// cf_opts.shape_map = shape_map; -// bool was_mutated; -// ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) -// .IgnoreError(); -// if (was_mutated) { -// RemoveDeadNodes(g); -// DumpGraph("ConstFolding", g); -// changed = true; -// } -// } + if (opts_.do_constant_folding()) { + ConstantFoldingOptions cf_opts; + cf_opts.shape_map = shape_map; + bool was_mutated; + ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) + .IgnoreError(); + if (was_mutated) { + RemoveDeadNodes(g); + DumpGraph("ConstFolding", g); + changed = true; + } + } if (opts_.do_function_inlining() && FixupSourceAndSinkEdges(g)) { DumpGraph("FixupSourceAndSinkEdges", g); From 024eab110f215b5a4eb8445f4030935c04a5b88b Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:07:57 +0300 Subject: [PATCH 21/64] Missing indentation --- tensorflow/core/common_runtime/graph_optimizer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 7a303bfaa6..2a1f3c3dad 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -60,7 +60,7 @@ void GraphOptimizer::Optimize( changed = true; } - if (opts_.do_constant_folding()) { + if (opts_.do_constant_folding()) { ConstantFoldingOptions cf_opts; cf_opts.shape_map = shape_map; bool was_mutated; @@ -71,7 +71,7 @@ void GraphOptimizer::Optimize( DumpGraph("ConstFolding", g); changed = true; } - } + } if (opts_.do_function_inlining() && FixupSourceAndSinkEdges(g)) { DumpGraph("FixupSourceAndSinkEdges", g); From 16783950c0b8722e48867f6280c14cad5e969963 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:09:01 +0300 Subject: [PATCH 
22/64] Missing indentation --- tensorflow/core/common_runtime/graph_optimizer.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 2a1f3c3dad..335d559b8f 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -61,16 +61,16 @@ void GraphOptimizer::Optimize( } if (opts_.do_constant_folding()) { - ConstantFoldingOptions cf_opts; - cf_opts.shape_map = shape_map; - bool was_mutated; - ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) + ConstantFoldingOptions cf_opts; + cf_opts.shape_map = shape_map; + bool was_mutated; + ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) .IgnoreError(); - if (was_mutated) { + if (was_mutated) { RemoveDeadNodes(g); DumpGraph("ConstFolding", g); changed = true; - } + } } if (opts_.do_function_inlining() && FixupSourceAndSinkEdges(g)) { From bfd1183cee2ee4f2f8bd18615e692d051dc46787 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:09:56 +0300 Subject: [PATCH 23/64] Missing indentation --- tensorflow/core/common_runtime/graph_optimizer.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 335d559b8f..367b0315a0 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -65,11 +65,11 @@ void GraphOptimizer::Optimize( cf_opts.shape_map = shape_map; bool was_mutated; ConstantFold(cf_opts, runtime, env, device, g, &was_mutated) - .IgnoreError(); + .IgnoreError(); if (was_mutated) { - RemoveDeadNodes(g); - DumpGraph("ConstFolding", g); - changed = true; + RemoveDeadNodes(g); + DumpGraph("ConstFolding", g); + changed = true; } } From 80111816f236974849cb5c93676db55bfcc18f13 Mon Sep 17 00:00:00 2001 From: 
Angelos Charalambidis Date: Tue, 3 Jul 2018 16:27:39 +0300 Subject: [PATCH 24/64] Reindent function transformation; hopefully it compiles --- .../optimizers/function_transformation.cc | 871 +++++++++--------- .../optimizers/function_transformation.h | 26 +- 2 files changed, 438 insertions(+), 459 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 260885612f..588f70c404 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -34,487 +34,468 @@ limitations under the License. #include "tensorflow/core/grappler/utils/functions.h" namespace tensorflow { - namespace grappler { - namespace { +namespace grappler { +namespace { - class FunctionInliningContext { - public: - explicit FunctionInliningContext(const GrapplerItem& item) - : library_(&item.graph.library()), functions_(InliningCandidates(item)) {} + class FunctionInliningContext { + public: + explicit FunctionInliningContext(const GrapplerItem& item) + : library_(&item.graph.library()), functions_(InliningCandidates(item)) {} - const FunctionDefLibrary& Library() const { return *library_; } + const FunctionDefLibrary& Library() const { return *library_; } - bool HasInlinedFunctions() const { return !functions_.empty(); } + bool HasInlinedFunctions() const { return !functions_.empty(); } - // Find inlining candidate by name. Return nullptr if not found. - const FunctionDef* FindInlinedFunction(const string& name) const { - auto it = functions_.find(name); - if (it != functions_.end()) { - return it->second; - } else { - return nullptr; - } - } - - private: - std::unordered_map InliningCandidates(const GrapplerItem& item) const { - - std::unordered_map functions; + // Find inlining candidate by name. Return nullptr if not found. 
+ const FunctionDef* FindInlinedFunction(const string& name) const { + auto it = functions_.find(name); + if (it != functions_.end()) { + return it->second; + } else { + return nullptr; + } + } - for (const FunctionDef& func : item.graph.library().function()) { - // Don't inline functions marked as noinline + private: + std::unordered_map InliningCandidates(const GrapplerItem& item) const { + std::unordered_map functions; + for (const FunctionDef& func : item.graph.library().function()) { + // Don't inline functions marked as noinline // if (func.attr().count("_noinline") != 0) { // continue; // } - // Don't touch anything marked XLA to prevent XLA failures further down - // the road. - if (func.attr().count("_XlaCompile") > 0 && - func.attr().at("_XlaCompile").b()) { - continue; - } - // Can't create IdentityN nodes with no input or output: skip these - // functions for now. - if (func.signature().input_arg_size() == 0 || - func.signature().output_arg_size() == 0) { - continue; - } - functions[func.signature().name()] = &func; - } - return functions; - } - - const FunctionDefLibrary* library_; - std::unordered_map functions_; - - TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext); - }; - - // Copy input/output argument type to the type. Return error if argument - // type is not explicitly defined, and not specified in function attributes. - Status CopyArgType(const NodeDef& func_node, - const std::unordered_map& func_attr, - const string& arg_kind, const OpDef::ArgDef& arg, - DataType* type) { - - if (arg.type() != DT_INVALID) { - *type = arg.type(); - } else { - auto it = func_attr.find(arg.type_attr()); - if (it == func_attr.end() || it->second.type() == DT_INVALID) { - return errors::InvalidArgument( - "Invalid ", arg_kind, " argument ", arg.name(), " for function ", - func_node.op(), " instantiated by ", func_node.name()); - } - *type = it->second.type(); + // Don't touch anything marked XLA to prevent XLA failures further down + // the road. 
+ if (func.attr().count("_XlaCompile") > 0 && + func.attr().at("_XlaCompile").b()) { + continue; } - return Status::OK(); - } - - // Copy input/output argument type to the type_list. Return error if argument - // type is not explicitly defined, and not specified in function attributes. - Status CopyArgTypeN(const NodeDef& func_node, - const std::unordered_map& func_attr, - const string& arg_kind, const OpDef::ArgDef& arg, - AttrValue::ListValue* type_list) { - if (arg.type() != DT_INVALID) { - type_list->add_type(arg.type()); - } else { - auto it = func_attr.find(arg.type_attr()); - if (it == func_attr.end() || it->second.type() == DT_INVALID) { - return errors::InvalidArgument( - "Invalid ", arg_kind, " argument ", arg.name(), " for function ", - func_node.op(), " instantiated by ", func_node.name()); - } - type_list->add_type(it->second.type()); + // Can't create IdentityN nodes with no input or output: skip these + // functions for now. + if (func.signature().input_arg_size() == 0 || + func.signature().output_arg_size() == 0) { + continue; } - return Status::OK(); + functions[func.signature().name()] = &func; } + return functions; + } - string ParseString(string input) { - - size_t pos = 0; - std::string res = ""; - std::string delimiter = ":"; - - if ((pos = input.find(delimiter)) != std::string::npos) { - res = res + input.substr(0, pos); - input.erase(0, pos + delimiter.length()); - res = res + "/Ret" + input; - } - else { - res = input + "/Ret0"; - } - -// std::cout << res << std::endl; - - return res; - } - - Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, - const FunctionInliningContext& function_inlining_ctx) { - - for (const NodeDef& node : item.graph.node()) { - - const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); - if (func != nullptr) { // If it's a function calling node - - for (int i = 0; i < func->signature().output_arg_size(); ++i) { - // const OpDef::ArgDef &arg = func->signature().output_arg(i); - 
foutputs.emplace(node.name()); // Fac - foutputs.emplace(strings::StrCat(node.name(), ":", i)); // Fac:i - //foutputs.emplace(strings::StrCat(node.name(), ":", arg.name(), ":", i)); // Fac:outarg:i - } - } - } - return Status::OK(); + const FunctionDefLibrary* library_; + std::unordered_map functions_; + + TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext); + }; + + // Copy input/output argument type to the type. Return error if argument + // type is not explicitly defined, and not specified in function attributes. + Status CopyArgType(const NodeDef& func_node, + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + DataType* type) { + if (arg.type() != DT_INVALID) { + *type = arg.type(); + } else { + auto it = func_attr.find(arg.type_attr()); + if (it == func_attr.end() || it->second.type() == DT_INVALID) { + return errors::InvalidArgument( + "Invalid ", arg_kind, " argument ", arg.name(), " for function ", + func_node.op(), " instantiated by ", func_node.name()); + } + *type = it->second.type(); + } + return Status::OK(); + } + + // Copy input/output argument type to the type_list. Return error if argument + // type is not explicitly defined, and not specified in function attributes. 
+ Status CopyArgTypeN(const NodeDef& func_node, + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + AttrValue::ListValue* type_list) { + if (arg.type() != DT_INVALID) { + type_list->add_type(arg.type()); + } else { + auto it = func_attr.find(arg.type_attr()); + if (it == func_attr.end() || it->second.type() == DT_INVALID) { + return errors::InvalidArgument( + "Invalid ", arg_kind, " argument ", arg.name(), " for function ", + func_node.op(), " instantiated by ", func_node.name()); + } + type_list->add_type(it->second.type()); + } + return Status::OK(); + } + + string ParseString(string input) { + + size_t pos = 0; + std::string res = ""; + std::string delimiter = ":"; + + if ((pos = input.find(delimiter)) != std::string::npos) { + res = res + input.substr(0, pos); + input.erase(0, pos + delimiter.length()); + res = res + "/Ret" + input; + } + else { + res = input + "/Ret0"; + } + return res; + } + + Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, + const FunctionInliningContext& function_inlining_ctx) { + for (const NodeDef& node : item.graph.node()) { + const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); + if (func != nullptr) { // If it's a function calling node + + for (int i = 0; i < func->signature().output_arg_size(); ++i) { + // const OpDef::ArgDef &arg = func->signature().output_arg(i); + foutputs.emplace(node.name()); // Fac + foutputs.emplace(strings::StrCat(node.name(), ":", i)); // Fac:i + //foutputs.emplace(strings::StrCat(node.name(), ":", arg.name(), ":", i)); // Fac:outarg:i + } + } + } + return Status::OK(); + } + + Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, + std::unordered_map &functions_in) { + const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); + + DataType type; + ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + + for (int i = 0; i < 
func.signature().input_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().input_arg(i); + + // Create and add in graph a Call node for every input arg + NodeDef *call = optimized_graph->add_node(); + call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); + call->set_op("Call"); + call->set_device(func_node.device()); + call->add_input(func_node.input(i)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); + (*call->mutable_attr())["T"].set_type(type); + (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*call->mutable_attr())["is_constant"].set_b(false); + + NodeDef* merge = argmerge_map[arg.name()]; + merge->add_input(call->name()); + } + + for (int i = 0; i < func.signature().output_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().output_arg(i); + + NodeDef *ret = optimized_graph->add_node(); + ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); + ret->set_op("Return"); + ret->set_device(func_node.device()); + // Counting on the fact that op name will be the same as the name given initially to function + ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); + (*ret->mutable_attr())["T"].set_type(type); + (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + + } + return Status::OK(); + } + + + Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, + GraphDef* optimized_graph, std::unordered_map &functions_in) { + + const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); + + std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); + if (!item) { + return errors::InvalidArgument("Failed to inline function ", func_node.op(), " instantiated by ", func_node.name()); + } + + std::set foutputs; + 
GatherOutputs(foutputs, *item, ctx); + + DataType type; + std::unordered_map input_nodes; + functions_in[func_node.op()].fetch = item->fetch; + ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + + for (int i = 0; i < func.signature().input_arg_size(); ++i) { + const OpDef::ArgDef& arg = func.signature().input_arg(i); + + input_nodes[arg.name()] = i; + + // Create and add in graph a Call node for every input arg + NodeDef* call = optimized_graph->add_node(); + call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); + call->set_op("Call"); + call->set_device(func_node.device()); + call->add_input(func_node.input(i)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); + (*call->mutable_attr())["T"].set_type(type); + (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*call->mutable_attr())["is_constant"].set_b(false); + + // Create and add a temporary merge node (IdentityN) for every input arg + NodeDef* merge = optimized_graph->add_node(); + merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", i)); + merge->set_op("IdentityN"); + merge->set_device(func_node.device()); + merge->add_input(call->name()); + + argmerge_map.emplace(arg.name(), merge); + } + + + for (NodeDef& func_body_node : *item->graph.mutable_node()) { + + // If the func body node is func's input argument + if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { + CHECK_EQ(0, func_body_node.input_size()); + // Turn input placeholders into identity nodes + if (IsPlaceholder(func_body_node)) { + func_body_node.set_op("Identity"); + } + // Connect merge with input arg + func_body_node.add_input(argmerge_map[func_body_node.name()]->name()); + } + + // Else if not an input_arg_node + else { + // Update the input names if any. 
+ for (string& input : *func_body_node.mutable_input()) { + + // If it takes input from a function + if (foutputs.find(input) != foutputs.end()) { + input = ParseString(input); } + input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); + } - - Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, - std::unordered_map &functions_in) { - -// printf("Recursion Detected\n"); - - const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); - - DataType type; - ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; - - for (int i = 0; i < func.signature().input_arg_size(); ++i) { - const OpDef::ArgDef &arg = func.signature().input_arg(i); - - // Create and add in graph a Call node for every input arg - NodeDef *call = optimized_graph->add_node(); - call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); - call->set_op("Call"); - call->set_device(func_node.device()); - call->add_input(func_node.input(i)); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); - (*call->mutable_attr())["T"].set_type(type); - (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); - (*call->mutable_attr())["is_constant"].set_b(false); - - NodeDef* merge = argmerge_map[arg.name()]; - merge->add_input(call->name()); - } - - for (int i = 0; i < func.signature().output_arg_size(); ++i) { - const OpDef::ArgDef &arg = func.signature().output_arg(i); - - NodeDef *ret = optimized_graph->add_node(); - ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); - ret->set_op("Return"); - ret->set_device(func_node.device()); - // Counting on the fact that op name will be the same as the name given initially to function - ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); - (*ret->mutable_attr())["T"].set_type(type); - 
(*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); - - } - - return Status::OK(); + // If the node has no input, make hook it up to the Merge nodes to ensure + // it runs in the same frame as the other nodes of the function body. + if (func_body_node.input_size() == 0) { + for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it) { + *func_body_node.add_input() = AsControlDependency(it->second->name()); } + } + } + // Add the node name as a prefix to avoid collisions after inlining + func_body_node.set_name(strings::StrCat(func_node.name(), "/", func_body_node.name())); - Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, - GraphDef* optimized_graph, std::unordered_map &functions_in) { + // Make sure the node is placed + func_body_node.set_device(func_node.device()); - const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); + // Check if a body node is itself a function + const FunctionDef* func_body_node_func = ctx.FindInlinedFunction(func_body_node.op()); - std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); - if (!item) { - return errors::InvalidArgument("Failed to inline function ", func_node.op(), " instantiated by ", func_node.name()); - } + // Node is yet another function + if (func_body_node_func != nullptr) { - std::set foutputs; - GatherOutputs(foutputs, *item, ctx); - - DataType type; - std::unordered_map input_nodes; - functions_in[func_node.op()].fetch = item->fetch; - ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; - - for (int i = 0; i < func.signature().input_arg_size(); ++i) { - const OpDef::ArgDef& arg = func.signature().input_arg(i); - - input_nodes[arg.name()] = i; - - // Create and add in graph a Call node for every input arg - NodeDef* call = optimized_graph->add_node(); - call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); - call->set_op("Call"); - 
call->set_device(func_node.device()); - call->add_input(func_node.input(i)); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); - (*call->mutable_attr())["T"].set_type(type); - (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); - (*call->mutable_attr())["is_constant"].set_b(false); - - // Create and add a temporary merge node (IdentityN) for every input arg - NodeDef* merge = optimized_graph->add_node(); - merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", i)); - merge->set_op("IdentityN"); - merge->set_device(func_node.device()); - merge->add_input(call->name()); - - argmerge_map.emplace(arg.name(), merge); - } + // Check if that function has already been inlined + auto it = functions_in.find(func_body_node.op()); + // Not already in => Inline it + if (it == functions_in.end()) { + FuncInfo func_info; + functions_in.emplace(func_body_node.op(), func_info); + InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); + functions_in.erase(func_body_node.op()); + } else { + // Already in -> Insert Enter/Exit ops end create cycle + // (recursion or mutually recursive functions) + CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in); + } + } else { + // Move the node to the main graph + optimized_graph->add_node()->Swap(&func_body_node); + } + } + + for (int i = 0; i < func.signature().output_arg_size(); ++i) { + const OpDef::ArgDef &arg = func.signature().output_arg(i); + + NodeDef *ret = optimized_graph->add_node(); + ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); + ret->set_op("Return"); + ret->set_device(func_node.device()); + // If it takes input from a function + string input = item->fetch[i]; + if (foutputs.find(input) != foutputs.end()) { + input = ParseString(input); + } + + ret->add_input(strings::StrCat(func_node.name(), "/", input)); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); + 
(*ret->mutable_attr())["T"].set_type(type); + (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + } + + // Break IdentityN Merges into multiple common Binary Merge ops + int j=0; + for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { + + DataType type; + NodeDef *new_merge, *merge = it->second; + int i, size = merge->input_size(); + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", func.signature().input_arg(j), &type)); + + // If there is only one call site + if (size < 2) { + merge->set_op("Identity"); + merge->set_device(func_node.device()); + (*merge->mutable_attr())["T"].set_type(type); + } + + else { + + string name = merge->name(); + string in1 = merge->input(0), in2; + + for (i = 1; i < size-1; i++) { + + in2 = merge->input(i); + new_merge = optimized_graph->add_node(); + + name = strings::StrCat(name, size - i - 1); + new_merge->set_name(name); + new_merge->set_op("Merge"); + new_merge->set_device(func_node.device()); + new_merge->add_input(in1); + new_merge->add_input(in2); + (*new_merge->mutable_attr())["T"].set_type(type); + (*new_merge->mutable_attr())["N"].set_i(2); + + in1 = name; + } - for (NodeDef& func_body_node : *item->graph.mutable_node()) { + // Modify initial Merge + in2 = merge->input(i); + merge->set_op("Merge"); + merge->set_device(func_node.device()); + merge->clear_input(); + merge->add_input(in1); + merge->add_input(in2); + (*merge->mutable_attr())["T"].set_type(type); + (*merge->mutable_attr())["N"].set_i(2); + } + } - // If the func body node is func's input argument - if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { - CHECK_EQ(0, func_body_node.input_size()); - // Turn input placeholders into identity nodes - if (IsPlaceholder(func_body_node)) { - func_body_node.set_op("Identity"); - } - // Connect merge with input arg - func_body_node.add_input(argmerge_map[func_body_node.name()]->name()); - } + return Status::OK(); + } - // Else if not an 
input_arg_node - else { - // Update the input names if any. - for (string& input : *func_body_node.mutable_input()) { - - // If it takes input from a function - if (foutputs.find(input) != foutputs.end()) { - input = ParseString(input); - } - input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); - } - - // If the node has no input, make hook it up to the Merge nodes to ensure - // it runs in the same frame as the other nodes of the function body. - if (func_body_node.input_size() == 0) { - for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it) { - *func_body_node.add_input() = AsControlDependency(it->second->name()); - } - } - } +} // namespace - // Add the node name as a prefix to avoid collisions after inlining - func_body_node.set_name(strings::StrCat(func_node.name(), "/", func_body_node.name())); - - // Make sure the node is placed - func_body_node.set_device(func_node.device()); - - // Check if a body node is itself a function - const FunctionDef* func_body_node_func = ctx.FindInlinedFunction(func_body_node.op()); - - // Node is yet another function - if (func_body_node_func != nullptr) { - - // Check if that function has already been inlined - auto it = functions_in.find(func_body_node.op()); - - // Not already in => Inline it - if (it == functions_in.end()) { - FuncInfo func_info; - functions_in.emplace(func_body_node.op(), func_info); - InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); - functions_in.erase(func_body_node.op()); - } - // Already in -> Insert Enter/Exit ops end create cycle - // (recursion or mutually recursive functions) - else { - CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in); - } - } - else { - // Move the node to the main graph - optimized_graph->add_node()->Swap(&func_body_node); - } - } + Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { - for (int i = 0; i < 
func.signature().output_arg_size(); ++i) { - const OpDef::ArgDef &arg = func.signature().output_arg(i); - - NodeDef *ret = optimized_graph->add_node(); - ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); - ret->set_op("Return"); - ret->set_device(func_node.device()); - // If it takes input from a function - string input = item->fetch[i]; - if (foutputs.find(input) != foutputs.end()) { - input = ParseString(input); - } + FunctionInliningContext function_inlining_ctx(item); - ret->add_input(strings::StrCat(func_node.name(), "/", input)); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); - (*ret->mutable_attr())["T"].set_type(type); - (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); - } + std::set foutputs; + GatherOutputs(foutputs, item, function_inlining_ctx); - // Break IdentityN Merges into multiple common Binary Merge ops - int j=0; - for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { + //std::cout << foutputs.size() << '\n'; + //for( const auto& str : foutputs ) std::cout << str << '\n'; - DataType type; - NodeDef *new_merge, *merge = it->second; - int i, size = merge->input_size(); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", func.signature().input_arg(j), &type)); + // Nothing to do here. 
+ if (!function_inlining_ctx.HasInlinedFunctions()) { + *optimized_graph = item.graph; + return Status::OK(); + } - // If there is only one call site - if (size < 2) { - merge->set_op("Identity"); - merge->set_device(func_node.device()); - (*merge->mutable_attr())["T"].set_type(type); - } + std::unordered_map functions_in; - else { - - string name = merge->name(); - string in1 = merge->input(0), in2; - - for (i = 1; i < size-1; i++) { - - in2 = merge->input(i); - new_merge = optimized_graph->add_node(); - - name = strings::StrCat(name, size - i - 1); - new_merge->set_name(name); - new_merge->set_op("Merge"); - new_merge->set_device(func_node.device()); - new_merge->add_input(in1); - new_merge->add_input(in2); - (*new_merge->mutable_attr())["T"].set_type(type); - (*new_merge->mutable_attr())["N"].set_i(2); - - in1 = name; - } - - // Modify initial Merge - in2 = merge->input(i); - merge->set_op("Merge"); - merge->set_device(func_node.device()); - merge->clear_input(); - merge->add_input(in1); - merge->add_input(in2); - (*merge->mutable_attr())["T"].set_type(type); - (*merge->mutable_attr())["N"].set_i(2); + // Copying node cause I need to make changes on it + for (NodeDef node : item.graph.node()) { + for (string& input : *node.mutable_input()) { + // If it takes input from a function + if (foutputs.find(input) != foutputs.end()) { + input = ParseString(input); + } + } + + const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); + if (func != nullptr) { + FuncInfo func_info; + functions_in.emplace(node.op(), func_info); + InlineFunction(node, *func, function_inlining_ctx, optimized_graph, functions_in); + functions_in.erase(node.op()); // At this point functions_in will be empty + + // Check if the function node corresponded to some fetch_outputs + // before transformation occurred + NodeDef *idN; + bool created = false; + const std::unordered_map func_attr(node.attr().begin(), node.attr().end()); + + for (size_t i = 0; i < item.fetch.size(); 
++i) { + const string &t = item.fetch[i]; + // Parse t into node_name and output_index. + TensorId id(ParseTensorName(t)); + + if (node.name() == id.first) { + + if (created == false) { + idN = optimized_graph->add_node(); + idN->set_op("IdentityN"); + idN->set_name(node.name()); + idN->set_device(node.device()); + + AttrValue::ListValue* type_list = (*idN->mutable_attr())["T"].mutable_list(); + for (const OpDef::ArgDef& arg : func->signature().output_arg()) { + TF_RETURN_IF_ERROR(CopyArgTypeN(node, func_attr, "input", arg, type_list)); } - } - - return Status::OK(); - } - - } // namespace - - - Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - - FunctionInliningContext function_inlining_ctx(item); - - std::set foutputs; - GatherOutputs(foutputs, item, function_inlining_ctx); -//std::cout << foutputs.size() << '\n'; -//for( const auto& str : foutputs ) std::cout << str << '\n'; + idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); - // Nothing to do here. 
- if (!function_inlining_ctx.HasInlinedFunctions()) { - *optimized_graph = item.graph; - return Status::OK(); - } - - std::unordered_map functions_in; - - // Copying node cause I need to make changes on it - for (NodeDef node : item.graph.node()) { - for (string& input : *node.mutable_input()) { - // If it takes input from a function - if (foutputs.find(input) != foutputs.end()) { - input = ParseString(input); - } - } - - const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); - if (func != nullptr) { - FuncInfo func_info; - functions_in.emplace(node.op(), func_info); - InlineFunction(node, *func, function_inlining_ctx, optimized_graph, functions_in); - functions_in.erase(node.op()); // At this point functions_in will be empty - - // Check if the function node corresponded to some fetch_outputs - // before transformation occurred - NodeDef *idN; - bool created = false; - const std::unordered_map func_attr(node.attr().begin(), node.attr().end()); - - for (size_t i = 0; i < item.fetch.size(); ++i) { - const string &t = item.fetch[i]; - // Parse t into node_name and output_index. 
- TensorId id(ParseTensorName(t)); - - if (node.name() == id.first) { - - if (created == false) { - idN = optimized_graph->add_node(); - idN->set_op("IdentityN"); - idN->set_name(node.name()); - idN->set_device(node.device()); - - AttrValue::ListValue* type_list = (*idN->mutable_attr())["T"].mutable_list(); - for (const OpDef::ArgDef& arg : func->signature().output_arg()) { - TF_RETURN_IF_ERROR(CopyArgTypeN(node, func_attr, "input", arg, type_list)); - } - - idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); - - created = true; - } else { - idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); - } - } - } - } - else { - *optimized_graph->add_node() = node; + created = true; + } else { + idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); } } - - *optimized_graph->mutable_versions() = item.graph.versions(); - *optimized_graph->mutable_library() = item.graph.library(); - - - - /******************************************************************************************************/ - // Dumps optimized graph in a not so readable form -// const GraphDef* tmp = optimized_graph; -// printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); - - // Write an event, so that we can visualize this optimized graph in tensorboard - EventsWriter writer("TRANSFORMATION"); - Event event; - event.set_wall_time(1234); - event.set_step(34); - - const size_t proto_size = optimized_graph->ByteSizeLong(); - void* buf = port::Malloc(proto_size); - if (buf == nullptr) { - return tensorflow::errors::ResourceExhausted("Failed to allocate memory to serialize message of type '" - ,optimized_graph->GetTypeName(), "' and size ", proto_size); - } - optimized_graph->SerializeToArray(buf, proto_size); - const void* bf = buf; - event.set_graph_def(bf, proto_size); - writer.WriteEvent(event); - /******************************************************************************************************/ - - return Status::OK(); } - - void 
FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimized_graph, - double result) { - // Nothing to do for FunctionOptimizer. - } - - } // end namespace grappler + } + else { + *optimized_graph->add_node() = node; + } + } + + *optimized_graph->mutable_versions() = item.graph.versions(); + *optimized_graph->mutable_library() = item.graph.library(); + + /******************************************************************************************************/ + // Dumps optimized graph in a not so readable form + // const GraphDef* tmp = optimized_graph; + // printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); + + // Write an event, so that we can visualize this optimized graph in tensorboard + EventsWriter writer("TRANSFORMATION"); + Event event; + event.set_wall_time(1234); + event.set_step(34); + + const size_t proto_size = optimized_graph->ByteSizeLong(); + void* buf = port::Malloc(proto_size); + if (buf == nullptr) { + return tensorflow::errors::ResourceExhausted("Failed to allocate memory to serialize message of type '" + ,optimized_graph->GetTypeName(), "' and size ", proto_size); + } + optimized_graph->SerializeToArray(buf, proto_size); + const void* bf = buf; + event.set_graph_def(bf, proto_size); + writer.WriteEvent(event); + /******************************************************************************************************/ + + return Status::OK(); + } + + void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, + double result) { + // Nothing to do for FunctionOptimizer. 
+ } +} // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h index 9c13372572..245b37e369 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.h +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/grappler/grappler_item.h" namespace tensorflow { - namespace grappler { - +namespace grappler { typedef std::unordered_map ArgMergeMap; @@ -30,23 +29,22 @@ namespace tensorflow { gtl::ArraySlice fetch; } FuncInfo; -// Replace function calling nodes with pairs of new 'Call/Return' operators - + // Replace function calling nodes with pairs of new 'Call/Return' operators class FunctionTransformation : public GraphOptimizer { - public: - FunctionTransformation() {} - ~FunctionTransformation() override {} + public: + FunctionTransformation() {} + ~FunctionTransformation() override {} - string name() const override { return "function_transformation"; }; + string name() const override { return "function_transformation"; }; - Status Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) override; + Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; - void Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimized_graph, double result) override; + void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, double result) override; }; - } // end namespace grappler +} // end namespace grappler } // end namespace tensorflow -#endif // TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_TRANSFORMATION_H_ \ No newline at end of file +#endif // TENSORFLOW_GRAPPLER_OPTIMIZERS_FUNCTION_TRANSFORMATION_H_ From 4a9907c29ab00e6c2665ec4d9b79afbffc384e67 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 
16:42:59 +0300 Subject: [PATCH 25/64] Fixes (possibly) the conflict with r1.4 --- tensorflow/python/framework/function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index e59082114e..d10771b078 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -25,9 +25,9 @@ import hashlib from tensorflow.core.framework import attr_value_pb2 +from tensorflow.core.framework import op_def_pb2 from tensorflow.python import pywrap_tensorflow as c_api from tensorflow.python.eager import context -from tensorflow.core.framework import op_def_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import graph_to_function_def From b12cc772604583cd2a95e0949b1f9a6028798c84 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 16:44:13 +0300 Subject: [PATCH 26/64] Fixes (possibly) the conflict with r1.4 --- tensorflow/python/framework/function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index d10771b078..8930d20711 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -24,8 +24,8 @@ import collections import hashlib -from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework import op_def_pb2 +from tensorflow.core.framework import attr_value_pb2 from tensorflow.python import pywrap_tensorflow as c_api from tensorflow.python.eager import context from tensorflow.python.framework import dtypes From e7a9899851b3a94cb8f6943af2e3347a5092eb9c Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 18:23:08 +0300 Subject: [PATCH 27/64] Split GetNextIterationCallNode to two separate functions --- tensorflow/core/graph/graph_constructor.cc | 36 
++++++++++++++++++------ 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index 475ba9acbf..c0e59518ab 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -385,26 +385,42 @@ Status GraphConstructor::BuildNodeIndex() { return Status::OK(); } -std::unordered_set GetNextIterationCallNodes( +std::unordered_set GetNextIterationNodes( const GraphConstructor::NodeDefSlice& node_defs) { - std::unordered_set next_iteration_call_nodes; + std::unordered_set next_iteration_nodes; for (int n = 0; n < node_defs.size(); ++n) { const NodeDef& node_def = *node_defs[n]; - if (IsNextIteration(node_def) || IsCall(node_def)) { - next_iteration_call_nodes.insert(node_def.name()); + if (IsNextIteration(node_def)) { + next_iteration_nodes.insert(node_def.name()); } } - return next_iteration_call_nodes; + return next_iteration_nodes; +} + +std::unordered_set GetCallNodes( + const GraphConstructor::NodeDefSlice& node_defs) { + std::unordered_set call_nodes; + + for (int n = 0; n < node_defs.size(); ++n) { + const NodeDef& node_def = *node_defs[n]; + if (IsCall(node_def)) { + call_nodes.insert(node_def.name()); + } + } + + return call_nodes; } Status GraphConstructor::InitFromEdges() { const int num_nodes = node_defs_.size(); pending_count_.reserve(num_nodes); outputs_.resize(num_nodes); - std::unordered_set next_iteration_call_nodes_ = - GetNextIterationCallNodes(node_defs_); + std::unordered_set next_iteration_nodes_ = + GetNextIterationNodes(node_defs_); + std::unordered_set call_nodes_ = + GetCallNodes(node_defs_); // Parse the inputs for each node. 
for (int n = 0; n < num_nodes; ++n) { @@ -423,8 +439,10 @@ Status GraphConstructor::InitFromEdges() { num_control_edges++; } else { TensorId id(ParseTensorName(input_name)); - if (next_iteration_call_nodes_.find(id.first.ToString()) != - next_iteration_call_nodes_.end()) { + if (next_iteration_nodes_.find(id.first.ToString()) != + next_iteration_nodes_.end() || + call_nodes_.find(id.first.ToString()) != + call_nodes_.end()) { has_loop_back_edge = true; } } From 5ac76f6f16c50cae93e28bea5c2522a1d70d1cb1 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 19:07:12 +0300 Subject: [PATCH 28/64] Update functions.{cc,h} with the original files. Hopefully, there was only wrong indentation, diff was super confused. Commit-used: 6a6661bbdce2172d27bf501e26baf09e8a658657 TODO: Checkout whether it is possible to bump with the most up-to-date version of. It seems that after that commit, there was heavy development in these files. --- tensorflow/core/grappler/utils/functions.cc | 237 +++++++++----------- tensorflow/core/grappler/utils/functions.h | 14 +- 2 files changed, 119 insertions(+), 132 deletions(-) diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 8333ddf134..37b00e0a30 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -26,128 +26,115 @@ limitations under the License. 
#include "tensorflow/core/grappler/utils.h" namespace tensorflow { - namespace grappler { - - std::unique_ptr GrapplerItemFromFunctionDef( - const FunctionDef& func, - const std::unordered_map& func_attr, - const FunctionDefLibrary& library) { - if (func.signature().name().empty()) { - LOG(ERROR) << "function name must be specified."; - return nullptr; - } - std::unique_ptr new_item(new GrapplerItem()); - new_item->id = func.signature().name(); - - std::unordered_map port_map; - - // Add the function inputs as placeholder - for (const auto& inp : func.signature().input_arg()) { - NodeDef* ph = new_item->graph.add_node(); - ph->set_name(inp.name()); - ph->set_op("Placeholder"); - if (inp.type() != DT_INVALID) { - (*ph->mutable_attr())["T"].set_type(inp.type()); - } else { - auto it = func_attr.find(inp.type_attr()); - if (it == func_attr.end()) { - LOG(ERROR) << "Unknown type attribute " << inp.type_attr() - << " for function input " << inp.name(); - return nullptr; - } else { - (*ph->mutable_attr())["T"] = it->second; - } - } - port_map[inp.name()] = inp.name(); - } - - // Add the function body to the graph. - FunctionLibraryDefinition func_def(OpRegistry::Global(), library); - - for (const NodeDef& node : func.node_def()) { - NodeDef* new_node = new_item->graph.add_node(); - *new_node = node; - // Replace the placeholder attribute values with the specified value. - for (auto& attr : *new_node->mutable_attr()) { - const string& ph_name = attr.second.placeholder(); - auto it = func_attr.find(ph_name); - if (it != func_attr.end()) { - attr.second = it->second; - } - } - - // Functions use a custom format to encode connectivity. Map these custom - // strings to regular ones. 
- const OpRegistrationData* registration; - Status status = func_def.LookUp(node.op(), ®istration); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " not registered: " << status; - return nullptr; - } - - tensorflow::NameRangeMap inputs; - tensorflow::NameRangeMap outputs; - status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, - &outputs); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " invalid: " << status; - return nullptr; - } - for (const auto& name_range : outputs) { - string port_prefix = - strings::StrCat(node.name(), ":", name_range.first, ":"); - int index_start = name_range.second.first; - int index_end = name_range.second.second; - for (int i = index_start; i < index_end; ++i) { - string port_id = strings::StrCat(port_prefix, i - index_start); - string port_name = strings::StrCat(node.name(), ":", i); - port_map[port_id] = port_name; - } - } - } - - for (auto& node : *new_item->graph.mutable_node()) { - // Rewrite the inputs to use the normal naming convention. - for (int i = 0; i < node.input_size(); ++i) { - const string& input = node.input(i); - if (IsControlInput(input)) { - // No need to remap control dependencies. - continue; - } else { - auto it = port_map.find(input); - if (it == port_map.end()) { - LOG(ERROR) << "Unknown input: " << input; - return nullptr; - } - node.set_input(i, it->second); - } - } - } - - // Add the function outputs to the list of fetch nodes, taking into account - // the output mapping if any. - for (const auto& out : func.signature().output_arg()) { - auto it = func.ret().find(out.name()); - if (it != func.ret().end()) { - auto it2 = port_map.find(it->second); - if (it2 == port_map.end()) { - LOG(ERROR) << "Unknown output mapping: " << it->first << " to " - << it->second; - return nullptr; - } else { - new_item->fetch.emplace_back(it2->second); - } - } else { - new_item->fetch.emplace_back(out.name()); - } - } - // Add the function inputs to the list of feeds. 
- for (const auto& inp : func.signature().input_arg()) { - new_item->feed.emplace_back(inp.name(), Tensor()); - } - - return new_item; - } - - } // end namespace grappler -} // end namespace tensorflow \ No newline at end of file +namespace grappler { + +std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library) { + if (func.signature().name().empty()) { + LOG(ERROR) << "function name must be specified."; + return nullptr; + } + std::unique_ptr new_item(new GrapplerItem()); + new_item->id = func.signature().name(); + + std::unordered_map port_map; + + // Add the function inputs as placeholder + for (const auto& inp : func.signature().input_arg()) { + NodeDef* ph = new_item->graph.add_node(); + ph->set_name(inp.name()); + ph->set_op("Placeholder"); + if (inp.type() != DT_INVALID) { + (*ph->mutable_attr())["T"].set_type(inp.type()); + } else { + auto it = func_attr.find(inp.type_attr()); + if (it == func_attr.end()) { + LOG(ERROR) << "Unknown type attribute " << inp.type_attr() + << " for function input " << inp.name(); + return nullptr; + } else { + (*ph->mutable_attr())["T"] = it->second; + } + } + port_map[inp.name()] = inp.name(); + } + + // Add the function body to the graph. + FunctionLibraryDefinition func_def(OpRegistry::Global(), library); + + for (const NodeDef& node : func.node_def()) { + NodeDef* new_node = new_item->graph.add_node(); + *new_node = node; + // Replace the placeholder attribute values with the specified value. + for (auto& attr : *new_node->mutable_attr()) { + const string& ph_name = attr.second.placeholder(); + auto it = func_attr.find(ph_name); + if (it != func_attr.end()) { + attr.second = it->second; + } + } + + // Functions use a custom format to encode connectivity. Map these custom + // strings to regular ones. 
+ const OpRegistrationData* registration; + Status status = func_def.LookUp(node.op(), ®istration); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " not registered: " << status; + return nullptr; + } + + tensorflow::NameRangeMap inputs; + tensorflow::NameRangeMap outputs; + status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, + &outputs); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " invalid: " << status; + return nullptr; + } + for (const auto& name_range : outputs) { + string port_prefix = + strings::StrCat(node.name(), ":", name_range.first, ":"); + int index_start = name_range.second.first; + int index_end = name_range.second.second; + for (int i = index_start; i < index_end; ++i) { + string port_id = strings::StrCat(port_prefix, i - index_start); + string port_name = strings::StrCat(node.name(), ":", i); + port_map[port_id] = port_name; + } + } + } + + for (auto& node : *new_item->graph.mutable_node()) { + // Rewrite the inputs to use the normal naming convention. + for (int i = 0; i < node.input_size(); ++i) { + const string& input = node.input(i); + if (IsControlInput(input)) { + // No need to remap control dependencies. + continue; + } else { + auto it = port_map.find(input); + if (it == port_map.end()) { + LOG(ERROR) << "Unknown input: " << input; + return nullptr; + } + node.set_input(i, it->second); + } + } + } + + // Add the function outputs to the list of fetch nodes. + for (const auto& out : func.signature().output_arg()) { + new_item->fetch.emplace_back(out.name()); + } + // Add the function inputs to the list of feeds. 
+ for (const auto& inp : func.signature().input_arg()) { + new_item->feed.emplace_back(inp.name(), Tensor()); + } + + return new_item; +} + +} // end namespace grappler +} // end namespace tensorflow diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h index 6d0eed3fa6..8f9b7d848a 100644 --- a/tensorflow/core/grappler/utils/functions.h +++ b/tensorflow/core/grappler/utils/functions.h @@ -24,16 +24,16 @@ limitations under the License. namespace tensorflow { - namespace grappler { +namespace grappler { // Factory method for creating a GrapplerItem from a FunctionDef. // Returns nullptr if the given function def cannot be converted. - std::unique_ptr GrapplerItemFromFunctionDef( - const FunctionDef& func, - const std::unordered_map& func_attr, - const FunctionDefLibrary& library); +std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library); - } // end namespace grappler +} // end namespace grappler } // end namespace tensorflow -#endif // TENSORFLOW_GRAPPLER_UTILS_FUNCTIONS_H_ \ No newline at end of file +#endif // TENSORFLOW_GRAPPLER_UTILS_FUNCTIONS_H_ From 984785296efd93adeb2050d388577612f660392e Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 19:59:12 +0300 Subject: [PATCH 29/64] Move Call/Return kernels and ops to separate file TODO: Must check that bazel hooks are placed appropriately for the proper registration of the new ops. 
--- tensorflow/core/BUILD | 3 + tensorflow/core/kernels/BUILD | 10 + tensorflow/core/kernels/control_flow_ops.cc | 170 ----------------- tensorflow/core/kernels/control_flow_ops.h | 25 --- .../core/kernels/function_control_ops.cc | 179 ++++++++++++++++++ .../core/kernels/function_control_ops.h | 35 ++++ tensorflow/core/ops/control_flow_ops.cc | 87 --------- tensorflow/core/ops/function_control_ops.cc | 96 ++++++++++ 8 files changed, 323 insertions(+), 282 deletions(-) create mode 100644 tensorflow/core/kernels/function_control_ops.cc create mode 100644 tensorflow/core/kernels/function_control_ops.h create mode 100644 tensorflow/core/ops/function_control_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b18b3cb123..cdf5914640 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -566,6 +566,7 @@ tf_gen_op_libs( "data_flow_ops", "dataset_ops", "function_ops", + "function_control_ops", "functional_ops", "image_ops", "io_ops", @@ -647,6 +648,7 @@ cc_library( ":dataset_ops_op_lib", ":function_ops_op_lib", ":functional_ops_op_lib", + ":function_control_ops_op_lib", ":image_ops_op_lib", ":io_ops_op_lib", ":linalg_ops_op_lib", @@ -779,6 +781,7 @@ cc_library( "//tensorflow/core/kernels:dataset_ops", "//tensorflow/core/kernels:fake_quant_ops", "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:function_control_ops", "//tensorflow/core/kernels:image", "//tensorflow/core/kernels:io", "//tensorflow/core/kernels:linalg", diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index a08e2f5ee3..a35c929bad 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1436,6 +1436,16 @@ tf_cc_test( ], ) +tf_kernel_library( + name = "function_control_ops", + prefix = "function_control_ops", + deps = [ + "//tensorflow/core:function_control_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], +) + cc_library( name = "data_flow", deps = [ diff --git 
a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 32fc1f3659..64c06786bc 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -587,176 +587,6 @@ REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL #endif // TENSORFLOW_USE_SYCL -/*************************************************************************************************/ -void CallOp::Compute(OpKernelContext* context) { - if (IsRefType(context->input_dtype(0))) { - context->forward_ref_input_to_ref_output(0, 0); - } else { - context->set_output(0, context->input(0)); - } -} - -REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOp); -REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOp); - -#define REGISTER_GPU_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Call").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) -#define REGISTER_GPU_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("RefCall").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) - -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); -REGISTER_GPU_KERNEL(bool); -REGISTER_GPU_REF_KERNEL(bool); - -#undef REGISTER_GPU_KERNEL -#undef REGISTER_GPU_REF_KERNEL - -#ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Call").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) -REGISTER_SYCL_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); - -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("RefCall").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) -REGISTER_SYCL_REF_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); - -#undef REGISTER_SYCL_KERNEL -#undef REGISTER_SYCL_REF_KERNEL -#define REGISTER_SYCL_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Call") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - 
.TypeConstraint("T"), \ - CallOp) - -#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefCall") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - CallOp) - -REGISTER_SYCL_HOST_KERNEL(int32); -REGISTER_SYCL_HOST_REF_KERNEL(int32); -REGISTER_SYCL_HOST_KERNEL(string); -REGISTER_SYCL_HOST_REF_KERNEL(string); -REGISTER_SYCL_HOST_KERNEL(ResourceHandle); - -#undef REGISTER_SYCL_HOST_KERNEL -#undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL - -#define REGISTER_GPU_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Call") \ - .Device(DEVICE_GPU) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - CallOp) - -#define REGISTER_GPU_HOST_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefCall") \ - .Device(DEVICE_GPU) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - CallOp) - -REGISTER_GPU_HOST_KERNEL(int32); -REGISTER_GPU_HOST_REF_KERNEL(int32); -REGISTER_GPU_HOST_KERNEL(string); -REGISTER_GPU_HOST_REF_KERNEL(string); -REGISTER_GPU_HOST_KERNEL(ResourceHandle); - -#undef REGISTER_GPU_HOST_KERNEL -#undef REGISTER_GPU_HOST_REF_KERNEL - -void ReturnOp::Compute(OpKernelContext* context) { - if (IsRefType(context->input_dtype(0))) { - context->forward_ref_input_to_ref_output(0, 0); - } else { - context->set_output(0, context->input(0)); - } -} - -REGISTER_KERNEL_BUILDER(Name("Return").Device(DEVICE_CPU), ReturnOp); -REGISTER_KERNEL_BUILDER(Name("RefReturn").Device(DEVICE_CPU), ReturnOp); - -#define REGISTER_GPU_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Return").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); -#define REGISTER_GPU_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("RefReturn").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); - -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); -REGISTER_GPU_KERNEL(bool); 
-REGISTER_GPU_REF_KERNEL(bool); - -#undef REGISTER_GPU_KERNEL -#undef REGISTER_GPU_REF_KERNEL - -#ifdef TENSORFLOW_USE_SYCL - #define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Return").Device(DEVICE_SYCL).TypeConstraint("T"), ReturnOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("RefReturn").Device(DEVICE_SYCL).TypeConstraint("T"), ReturnOp); -REGISTER_SYCL_KERNEL(bool); -TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); - -#undef REGISTER_SYCL_KERNEL -#undef REGISTER_SYCL_REF_KERNEL - -#define REGISTER_SYCL_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Return") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - ReturnOp); \ - REGISTER_KERNEL_BUILDER(Name("RefReturn") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - ReturnOp) - -REGISTER_SYCL_HOST_KERNEL(int32); -REGISTER_SYCL_HOST_KERNEL(string); -#undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL - -#define REGISTER_GPU_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Return") \ - .Device(DEVICE_GPU) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - ReturnOp); \ - REGISTER_KERNEL_BUILDER(Name("RefReturn") \ - .Device(DEVICE_GPU) \ - .HostMemory("data") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - ReturnOp) - -REGISTER_GPU_HOST_KERNEL(int32); -REGISTER_GPU_HOST_KERNEL(string); - -#undef REGISTER_GPU_HOST_KERNEL -/*************************************************************************************************/ - // A LoopCond op has one input and one output. The input is a boolean // scalar representing the taken branches of the "pivot" Switch that // determines loop termination. 
As a contract, any high-level front-end diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h index e8b6b826d9..4838f2e2bf 100644 --- a/tensorflow/core/kernels/control_flow_ops.h +++ b/tensorflow/core/kernels/control_flow_ops.h @@ -97,31 +97,6 @@ class NextIterationOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp); }; -// A call op has one input and one output. It creates or finds -// the child frame that is uniquely identified by the frame_name, -// and makes its input available to the child frame. -class CallOp : public OpKernel { -public: - explicit CallOp(OpKernelConstruction* context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override; - bool IsExpensive() override { return false; } - ~CallOp() override {} - - TF_DISALLOW_COPY_AND_ASSIGN(CallOp); -}; - -// A Return op has one input and one output. It exits the current -// frame to its parent frame, and makes its input available to the -// parent frame only if it receives a tensor with a specific tag. 
-class ReturnOp : public OpKernel { -public: - explicit ReturnOp(OpKernelConstruction* context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override; - bool IsExpensive() override { return false; } - ~ReturnOp() override {} - - TF_DISALLOW_COPY_AND_ASSIGN(ReturnOp); -}; } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_ diff --git a/tensorflow/core/kernels/function_control_ops.cc b/tensorflow/core/kernels/function_control_ops.cc new file mode 100644 index 0000000000..9d190e324b --- /dev/null +++ b/tensorflow/core/kernels/function_control_ops.cc @@ -0,0 +1,179 @@ +#include "tensorflow/core/kernels/function_control_ops.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +void CallOp::Compute(OpKernelContext* context) { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } +} + +REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOp); +REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Call").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) +#define REGISTER_GPU_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefCall").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); +REGISTER_GPU_KERNEL(bool); +REGISTER_GPU_REF_KERNEL(bool); + +#undef REGISTER_GPU_KERNEL +#undef REGISTER_GPU_REF_KERNEL + +#ifdef TENSORFLOW_USE_SYCL +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Call").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) +REGISTER_SYCL_KERNEL(bool); 
+TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); + +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefCall").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) +REGISTER_SYCL_REF_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); + +#undef REGISTER_SYCL_KERNEL +#undef REGISTER_SYCL_REF_KERNEL +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Call") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefCall") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_REF_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +REGISTER_SYCL_HOST_REF_KERNEL(string); +REGISTER_SYCL_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_SYCL_HOST_KERNEL +#undef REGISTER_SYCL_HOST_REF_KERNEL +#endif // TENSORFLOW_USE_SYCL + +#define REGISTER_GPU_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Call") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +#define REGISTER_GPU_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefCall") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + CallOp) + +REGISTER_GPU_HOST_KERNEL(int32); +REGISTER_GPU_HOST_REF_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(string); +REGISTER_GPU_HOST_REF_KERNEL(string); +REGISTER_GPU_HOST_KERNEL(ResourceHandle); + +#undef REGISTER_GPU_HOST_KERNEL +#undef REGISTER_GPU_HOST_REF_KERNEL + +void ReturnOp::Compute(OpKernelContext* context) { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } +} + +REGISTER_KERNEL_BUILDER(Name("Return").Device(DEVICE_CPU), ReturnOp); 
+REGISTER_KERNEL_BUILDER(Name("RefReturn").Device(DEVICE_CPU), ReturnOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Return").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); +#define REGISTER_GPU_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefReturn").Device(DEVICE_GPU).TypeConstraint("T"), ReturnOp); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); +REGISTER_GPU_KERNEL(bool); +REGISTER_GPU_REF_KERNEL(bool); + +#undef REGISTER_GPU_KERNEL +#undef REGISTER_GPU_REF_KERNEL + +#ifdef TENSORFLOW_USE_SYCL + #define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Return").Device(DEVICE_SYCL).TypeConstraint("T"), ReturnOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefReturn").Device(DEVICE_SYCL).TypeConstraint("T"), ReturnOp); +REGISTER_SYCL_KERNEL(bool); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); + +#undef REGISTER_SYCL_KERNEL +#undef REGISTER_SYCL_REF_KERNEL + +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Return") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp); \ + REGISTER_KERNEL_BUILDER(Name("RefReturn") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp) + +REGISTER_SYCL_HOST_KERNEL(int32); +REGISTER_SYCL_HOST_KERNEL(string); +#undef REGISTER_SYCL_HOST_KERNEL +#endif // TENSORFLOW_USE_SYCL + +#define REGISTER_GPU_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Return") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp); \ + REGISTER_KERNEL_BUILDER(Name("RefReturn") \ + .Device(DEVICE_GPU) \ + .HostMemory("data") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ReturnOp) + +REGISTER_GPU_HOST_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(string); + +#undef REGISTER_GPU_HOST_KERNEL + +} // namespace tensorflow 
diff --git a/tensorflow/core/kernels/function_control_ops.h b/tensorflow/core/kernels/function_control_ops.h new file mode 100644 index 0000000000..62aaada374 --- /dev/null +++ b/tensorflow/core/kernels/function_control_ops.h @@ -0,0 +1,35 @@ +#ifndef TENSORFLOW_KERNELS_FUNCTION_CONTROL_OPS_H_ +#define TENSORFLOW_KERNELS_FUNCTION_CONTROL_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +// A call op has one input and one output. It creates or finds +// the child frame that is uniquely identified by the frame_name, +// and makes its input available to the child frame. +class CallOp : public OpKernel { +public: + explicit CallOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~CallOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(CallOp); +}; + +// A Return op has one input and one output. It exits the current +// frame to its parent frame, and makes its input available to the +// parent frame only if it receives a tensor with a specific tag. +class ReturnOp : public OpKernel { +public: + explicit ReturnOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override; + bool IsExpensive() override { return false; } + ~ReturnOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(ReturnOp); +}; +} // namespace tensorflow + +#endif diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 145017b43a..61089658d7 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -309,93 +309,6 @@ data: The tensor to be made available to the next iteration. output: The same tensor as `data`. 
)doc"); -// -------------------------------------------------------------------------- -REGISTER_OP("Call") - .Input("data: T") - .Output("output: T") - .Attr("T: type") - .Attr("frame_name: string") - .Attr("is_constant: bool = false") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->UnknownShape()); - - // Handle resource shape / dtype, if present. - auto* handle_data = c->input_handle_shapes_and_types(0); - if (handle_data != nullptr) { - c->set_output_handle_shapes_and_types(0, *handle_data); - } else { - // Otherwise, propagate shape if output is a constant. - bool is_constant; - TF_RETURN_IF_ERROR(c->GetAttr("is_constant", &is_constant)); - if (is_constant) { - c->set_output(0, c->input(0)); - } - } - return Status::OK(); - }) - .Doc(R"Doc( -Creates (or finds) a child frame, and makes `data` available to the child frame. - -This op is used together with `Return` to create recursive calls in the graph. -The unique `frame_name` is used by the `Executor` to identify frames. - -data: The tensor to be made available to the child frame. -frame_name: The name of the child frame. -output: The same tensor as `data`. - -Returns tensors with the same shapes and contents as the input -tensors. - )Doc"); - -REGISTER_OP("RefCall") - .Input("data: Ref(T)") - .Output("output: Ref(T)") - .Attr("T: type") - .Attr("frame_name: string") - .Attr("is_constant: bool = false") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"Doc( -Creates (or finds) a child frame, and makes `data` available to the child frame. - -This op is used together with `Return` to create recursive calls in the graph. -The unique `frame_name` is used by the `Executor` to identify frames. - -data: The tensor to be made available to the child frame. -frame_name: The name of the child frame. -output: The same tensor as `data`. - -Returns tensors with the same shapes and contents as the input -tensors. 
- )Doc"); - -// -------------------------------------------------------------------------- -REGISTER_OP("Return") -.Input("data: T") -.Output("output: T") -.Attr("T: type") -.Attr("frame_name: string") -.SetShapeFn(shape_inference::UnchangedShape) -.Doc(R"Doc( -Exits the current frame to its parent frame. -Exit makes its input `data` available to the parent frame. -data: The list of tensors to be made available to the parent frame. -output: The same list of tensors as `data`. - )Doc"); - -REGISTER_OP("RefReturn") -.Input("data: Ref(T)") -.Output("output: Ref(T)") -.Attr("T: type") -.Attr("frame_name: string") -.SetShapeFn(shape_inference::UnchangedShape) -.Doc(R"Doc( -Exits the current frame to its parent frame. -Exit makes its input `data` available to the parent frame. -data: The tensors to be made available to the parent frame. -output: The same tensors as `data`. - )Doc"); - - // -------------------------------------------------------------------------- REGISTER_OP("LoopCond") .Input("input: bool") diff --git a/tensorflow/core/ops/function_control_ops.cc b/tensorflow/core/ops/function_control_ops.cc new file mode 100644 index 0000000000..731fc26963 --- /dev/null +++ b/tensorflow/core/ops/function_control_ops.cc @@ -0,0 +1,96 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +// -------------------------------------------------------------------------- +REGISTER_OP("Call") + .Input("data: T") + .Output("output: T") + .Attr("T: type") + .Attr("frame_name: string") + .Attr("is_constant: bool = false") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->UnknownShape()); + + // Handle resource shape / dtype, if present. 
+ auto* handle_data = c->input_handle_shapes_and_types(0); + if (handle_data != nullptr) { + c->set_output_handle_shapes_and_types(0, *handle_data); + } else { + // Otherwise, propagate shape if output is a constant. + bool is_constant; + TF_RETURN_IF_ERROR(c->GetAttr("is_constant", &is_constant)); + if (is_constant) { + c->set_output(0, c->input(0)); + } + } + return Status::OK(); + }) + .Doc(R"Doc( +Creates (or finds) a child frame, and makes `data` available to the child frame. + +This op is used together with `Return` to create recursive calls in the graph. +The unique `frame_name` is used by the `Executor` to identify frames. + +data: The tensor to be made available to the child frame. +frame_name: The name of the child frame. +output: The same tensor as `data`. + +Returns tensors with the same shapes and contents as the input +tensors. + )Doc"); + +REGISTER_OP("RefCall") + .Input("data: Ref(T)") + .Output("output: Ref(T)") + .Attr("T: type") + .Attr("frame_name: string") + .Attr("is_constant: bool = false") + .SetShapeFn(shape_inference::UnchangedShape) + .Doc(R"Doc( +Creates (or finds) a child frame, and makes `data` available to the child frame. + +This op is used together with `Return` to create recursive calls in the graph. +The unique `frame_name` is used by the `Executor` to identify frames. + +data: The tensor to be made available to the child frame. +frame_name: The name of the child frame. +output: The same tensor as `data`. + +Returns tensors with the same shapes and contents as the input +tensors. + )Doc"); + +// -------------------------------------------------------------------------- +REGISTER_OP("Return") +.Input("data: T") +.Output("output: T") +.Attr("T: type") +.Attr("frame_name: string") +.SetShapeFn(shape_inference::UnchangedShape) +.Doc(R"Doc( +Exits the current frame to its parent frame. +Exit makes its input `data` available to the parent frame. +data: The list of tensors to be made available to the parent frame. 
+output: The same list of tensors as `data`. + )Doc"); + +REGISTER_OP("RefReturn") +.Input("data: Ref(T)") +.Output("output: Ref(T)") +.Attr("T: type") +.Attr("frame_name: string") +.SetShapeFn(shape_inference::UnchangedShape) +.Doc(R"Doc( +Exits the current frame to its parent frame. +Exit makes its input `data` available to the parent frame. +data: The tensors to be made available to the parent frame. +output: The same tensors as `data`. + )Doc"); + +} // namespace tensorflow From ab40d1e71bfe85eef04672547ec7e9bccd2675c1 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 3 Jul 2018 22:51:16 +0300 Subject: [PATCH 30/64] Fix compilation hiccups --- tensorflow/core/graph/graph_constructor.cc | 2 +- tensorflow/core/kernels/BUILD | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index c0e59518ab..87074397b6 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -410,7 +410,7 @@ std::unordered_set GetCallNodes( } } - return next_call_nodes; + return call_nodes; } Status GraphConstructor::InitFromEdges() { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index a35c929bad..e40d82ee0e 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1440,7 +1440,7 @@ tf_kernel_library( name = "function_control_ops", prefix = "function_control_ops", deps = [ - "//tensorflow/core:function_control_op_lib", + "//tensorflow/core:function_control_ops_op_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", ], From a067cbcf01b1ac18c95f150491cdbd6087497f0d Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Wed, 4 Jul 2018 20:28:08 +0300 Subject: [PATCH 31/64] Include function_control_ops in test deps --- tensorflow/core/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index cdf5914640..faaa206afc 
100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2926,6 +2926,7 @@ tf_cc_test( "//tensorflow/core/kernels:dense_update_ops", "//tensorflow/core/kernels:fifo_queue_op", "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:function_control_ops", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:matmul_op", "//tensorflow/core/kernels:ops_util", From b32196a14f786567281b6b5e4b5ef9c47eafcbbc Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 01:30:57 +0300 Subject: [PATCH 32/64] Fixes in Build for unit testing --- tensorflow/core/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index cdf5914640..581cec8812 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2922,11 +2922,13 @@ tf_cc_test( ":testlib", "//tensorflow/cc:cc_ops", "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/core/kernels:function_control_ops", "//tensorflow/core/kernels:cwise_op", "//tensorflow/core/kernels:dense_update_ops", "//tensorflow/core/kernels:fifo_queue_op", "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:identity_op", + "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:matmul_op", "//tensorflow/core/kernels:ops_util", "//tensorflow/core/kernels:queue_ops", @@ -2963,11 +2965,13 @@ tf_cc_test( # Link with support for TensorFlow Debugger (tfdbg). 
"//tensorflow/core/debug", "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/core/kernels:function_control_ops", "//tensorflow/core/kernels:cwise_op", "//tensorflow/core/kernels:dense_update_ops", "//tensorflow/core/kernels:fifo_queue_op", "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:identity_op", + "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:matmul_op", "//tensorflow/core/kernels:ops_util", "//tensorflow/core/kernels:queue_ops", @@ -3220,6 +3224,7 @@ tf_cc_test( "//tensorflow/core/kernels:array", "//tensorflow/core/kernels:data_flow", "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:function_control_ops", "//tensorflow/core/kernels:math", "//third_party/eigen3", ], From a923f21e8ee3700869f3a57f06c24711ec84721e Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 01:41:20 +0300 Subject: [PATCH 33/64] Common Runtime Fixes in Build for unit testing --- tensorflow/core/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 6cad165025..581cec8812 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2927,7 +2927,6 @@ tf_cc_test( "//tensorflow/core/kernels:dense_update_ops", "//tensorflow/core/kernels:fifo_queue_op", "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:function_control_ops", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:matmul_op", From 5f463c63d419e1942ff786797a72bd8072ca1217 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 03:40:28 +0300 Subject: [PATCH 34/64] restoring utils/functions.cc - some important code was mysteriously deleted leading to errors in transformation!! ._. 
--- tensorflow/core/grappler/utils/functions.cc | 235 +++++++++++--------- 1 file changed, 124 insertions(+), 111 deletions(-) diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 37b00e0a30..8e7243ceed 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -26,115 +26,128 @@ limitations under the License. #include "tensorflow/core/grappler/utils.h" namespace tensorflow { -namespace grappler { - -std::unique_ptr GrapplerItemFromFunctionDef( - const FunctionDef& func, - const std::unordered_map& func_attr, - const FunctionDefLibrary& library) { - if (func.signature().name().empty()) { - LOG(ERROR) << "function name must be specified."; - return nullptr; - } - std::unique_ptr new_item(new GrapplerItem()); - new_item->id = func.signature().name(); - - std::unordered_map port_map; - - // Add the function inputs as placeholder - for (const auto& inp : func.signature().input_arg()) { - NodeDef* ph = new_item->graph.add_node(); - ph->set_name(inp.name()); - ph->set_op("Placeholder"); - if (inp.type() != DT_INVALID) { - (*ph->mutable_attr())["T"].set_type(inp.type()); - } else { - auto it = func_attr.find(inp.type_attr()); - if (it == func_attr.end()) { - LOG(ERROR) << "Unknown type attribute " << inp.type_attr() - << " for function input " << inp.name(); - return nullptr; - } else { - (*ph->mutable_attr())["T"] = it->second; - } - } - port_map[inp.name()] = inp.name(); - } - - // Add the function body to the graph. - FunctionLibraryDefinition func_def(OpRegistry::Global(), library); - - for (const NodeDef& node : func.node_def()) { - NodeDef* new_node = new_item->graph.add_node(); - *new_node = node; - // Replace the placeholder attribute values with the specified value. 
- for (auto& attr : *new_node->mutable_attr()) { - const string& ph_name = attr.second.placeholder(); - auto it = func_attr.find(ph_name); - if (it != func_attr.end()) { - attr.second = it->second; - } - } - - // Functions use a custom format to encode connectivity. Map these custom - // strings to regular ones. - const OpRegistrationData* registration; - Status status = func_def.LookUp(node.op(), ®istration); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " not registered: " << status; - return nullptr; - } - - tensorflow::NameRangeMap inputs; - tensorflow::NameRangeMap outputs; - status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, - &outputs); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " invalid: " << status; - return nullptr; - } - for (const auto& name_range : outputs) { - string port_prefix = - strings::StrCat(node.name(), ":", name_range.first, ":"); - int index_start = name_range.second.first; - int index_end = name_range.second.second; - for (int i = index_start; i < index_end; ++i) { - string port_id = strings::StrCat(port_prefix, i - index_start); - string port_name = strings::StrCat(node.name(), ":", i); - port_map[port_id] = port_name; - } - } - } - - for (auto& node : *new_item->graph.mutable_node()) { - // Rewrite the inputs to use the normal naming convention. - for (int i = 0; i < node.input_size(); ++i) { - const string& input = node.input(i); - if (IsControlInput(input)) { - // No need to remap control dependencies. 
- continue; - } else { - auto it = port_map.find(input); - if (it == port_map.end()) { - LOG(ERROR) << "Unknown input: " << input; - return nullptr; + namespace grappler { + + std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library) { + if (func.signature().name().empty()) { + LOG(ERROR) << "function name must be specified."; + return nullptr; + } + std::unique_ptr new_item(new GrapplerItem()); + new_item->id = func.signature().name(); + + std::unordered_map port_map; + + // Add the function inputs as placeholder + for (const auto& inp : func.signature().input_arg()) { + NodeDef* ph = new_item->graph.add_node(); + ph->set_name(inp.name()); + ph->set_op("Placeholder"); + if (inp.type() != DT_INVALID) { + (*ph->mutable_attr())["T"].set_type(inp.type()); + } else { + auto it = func_attr.find(inp.type_attr()); + if (it == func_attr.end()) { + LOG(ERROR) << "Unknown type attribute " << inp.type_attr() + << " for function input " << inp.name(); + return nullptr; + } else { + (*ph->mutable_attr())["T"] = it->second; + } + } + port_map[inp.name()] = inp.name(); + } + + // Add the function body to the graph. + FunctionLibraryDefinition func_def(OpRegistry::Global(), library); + + for (const NodeDef& node : func.node_def()) { + NodeDef* new_node = new_item->graph.add_node(); + *new_node = node; + // Replace the placeholder attribute values with the specified value. + for (auto& attr : *new_node->mutable_attr()) { + const string& ph_name = attr.second.placeholder(); + auto it = func_attr.find(ph_name); + if (it != func_attr.end()) { + attr.second = it->second; + } + } + + // Functions use a custom format to encode connectivity. Map these custom + // strings to regular ones. 
+ const OpRegistrationData* registration; + Status status = func_def.LookUp(node.op(), ®istration); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " not registered: " << status; + return nullptr; + } + + tensorflow::NameRangeMap inputs; + tensorflow::NameRangeMap outputs; + status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, + &outputs); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " invalid: " << status; + return nullptr; + } + for (const auto& name_range : outputs) { + string port_prefix = + strings::StrCat(node.name(), ":", name_range.first, ":"); + int index_start = name_range.second.first; + int index_end = name_range.second.second; + for (int i = index_start; i < index_end; ++i) { + string port_id = strings::StrCat(port_prefix, i - index_start); + string port_name = strings::StrCat(node.name(), ":", i); + port_map[port_id] = port_name; + } + } + } + + for (auto& node : *new_item->graph.mutable_node()) { + // Rewrite the inputs to use the normal naming convention. + for (int i = 0; i < node.input_size(); ++i) { + const string& input = node.input(i); + if (IsControlInput(input)) { + // No need to remap control dependencies. + continue; + } else { + auto it = port_map.find(input); + if (it == port_map.end()) { + LOG(ERROR) << "Unknown input: " << input; + return nullptr; + } + node.set_input(i, it->second); + } + } + } + + // Add the function outputs to the list of fetch nodes, taking into account + // the output mapping if any. + for (const auto& out : func.signature().output_arg()) { + auto it = func.ret().find(out.name()); + if (it != func.ret().end()) { + auto it2 = port_map.find(it->second); + if (it2 == port_map.end()) { + LOG(ERROR) << "Unknown output mapping: " << it->first << " to " + << it->second; + return nullptr; + } else { + new_item->fetch.emplace_back(it2->second); + } + } else { + new_item->fetch.emplace_back(out.name()); + } + } + // Add the function inputs to the list of feeds. 
+ for (const auto& inp : func.signature().input_arg()) { + new_item->feed.emplace_back(inp.name(), Tensor()); + } + + return new_item; } - node.set_input(i, it->second); - } - } - } - - // Add the function outputs to the list of fetch nodes. - for (const auto& out : func.signature().output_arg()) { - new_item->fetch.emplace_back(out.name()); - } - // Add the function inputs to the list of feeds. - for (const auto& inp : func.signature().input_arg()) { - new_item->feed.emplace_back(inp.name(), Tensor()); - } - - return new_item; -} - -} // end namespace grappler -} // end namespace tensorflow + + } // end namespace grappler +} // end namespace tensorflow \ No newline at end of file From b59658a0000556c36e95153f6cfd4062b7da8c07 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 05:59:14 +0300 Subject: [PATCH 35/64] Possibly fixed Constant Folding - Enabled it everywhere, and made it take place after the transformation phase. If anything goes south, this one will be the first to go in quarantine !! - Commited also 3 printfs that are quite important during the phase of development. If the transformation fails, the optimizations will just be ignored and the initial graph with the CallOps will be executed instead. So, the results will be normal/correct and there will be no way of knowing what happened. If "Transformation passed successfully!" message gets printed then transformation succeeded. The 2 other messages that are being printed by the executor repeatedly, indicate which node has being processed at that time, and the frame in which it belongs. Those 3 messages can confirm, for now, that the transformation occurred and help in debugging. 
--- tensorflow/core/common_runtime/executor.cc | 3 +++ tensorflow/core/common_runtime/graph_execution_state.cc | 1 + tensorflow/core/common_runtime/graph_optimizer.cc | 2 -- tensorflow/core/graph/graph.h | 2 +- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 8 ++++---- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 11476ab89c..5455f69f91 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1984,6 +1984,9 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, FrameState* output_frame = input_frame; int64 output_iter = input_iter; +printf("Propagate Outputs: %s\n", node->name().c_str()); +printf("Frame: %s\n", input_frame->frame_name.c_str()); + if (!item->is_enter_exit_or_next_iter && !item->is_call_or_return) { // Fast path for nodes types that don't need special handling DCHECK_EQ(input_frame, output_frame); diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index e3fadc6a63..772e687187 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -380,6 +380,7 @@ Status GraphExecutionState::OptimizeGraph( const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); + printf("Transformation passed successfully!\n"); /*******************************************************************************************/ // The graph conversion sets the requested device names but not the assigned // device names. 
However, since at this point the graph is placed TF expects diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 367b0315a0..ff99db9532 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -28,8 +28,6 @@ GraphOptimizer::GraphOptimizer(const OptimizerOptions& opts) : opts_(opts) { if (opts_.opt_level() >= OptimizerOptions::L1) { opts_.set_do_common_subexpression_elimination(true); opts_.set_do_constant_folding(true); - // set constant folding to false for now; don't know why.. - opts_.set_do_constant_folding(false); } } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 9f5906fd39..226eb775a2 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -159,7 +159,7 @@ class Node { bool IsControlFlow() const { return (class_ != NC_OTHER) && // Fast path (IsSwitch() || IsMerge() || IsEnter() || IsExit() || - IsNextIteration()); + IsNextIteration() || IsCall() || IsReturn()); } bool IsHostSend() const { return class_ == NC_HOST_SEND; } bool IsHostRecv() const { return class_ == NC_HOST_RECV; } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index acf5e258a7..7fd322d6bf 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -66,14 +66,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, if (!cfg_.disable_model_pruning()) { optimizers.push_back(std::unique_ptr(new ModelPruner())); } - if (cfg_.constant_folding() != RewriterConfig::OFF) { - optimizers.push_back( - std::unique_ptr(new ConstantFolding(cpu_device_))); - } if (cfg_.function_transformation() != RewriterConfig::OFF) { optimizers.push_back( std::unique_ptr(new FunctionTransformation())); } + if (cfg_.constant_folding() != RewriterConfig::OFF) { + 
optimizers.push_back( + std::unique_ptr(new ConstantFolding(cpu_device_))); + } if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { optimizers.push_back( std::unique_ptr(new ArithmeticOptimizer())); From 747596338de6bf490c093348fbf028822032495a Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 06:30:11 +0300 Subject: [PATCH 36/64] Possibly fixed Constant Folding - should have been included in previous commit --- tensorflow/core/grappler/optimizers/constant_folding.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index faea843c69..7885facac7 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -251,9 +251,10 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const { if (op == "Const") { return false; } - // Skip constrol flow nodes, they can't be folded + // Skip control flow nodes, they can't be folded if (op == "Enter" || op == "RefEnter" || op == "Exit" || op == "RefExit" || - op == "NextIteration" || op == "RefNextIteration") { + op == "NextIteration" || op == "RefNextIteration" || + op == "Call" || op == "RefCall" || op == "Return" || op == "RefReturn") { return false; } if (op.find("Placeholder") == 0) { @@ -283,7 +284,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const { if (op_def->output_arg_size() == 0) { return false; } - // No need to (and don't) fold nodes that have no outgoing edges except // whitelisted nodes. 
Such nodes could be introduced by an earlier constant // folding pass and are preserved in case users want to fetch their values; From 2aeca949266893cbd9d948f8bf0cfa7626f32227 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 06:31:35 +0300 Subject: [PATCH 37/64] Deleting copyrights --- .../grappler/optimizers/function_transformation.cc | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 588f70c404..b3f5f97257 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -1,17 +1,3 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ #include "tensorflow/core/grappler/optimizers/function_transformation.h" #include From 99d3cc2ca977da5d8c8557402bc6c4c80162e0eb Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Thu, 5 Jul 2018 06:32:24 +0300 Subject: [PATCH 38/64] Enabled Topological Sort --- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 2 +- tensorflow/core/grappler/utils/topological_sort.cc | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 7fd322d6bf..cf0345ce8e 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -126,7 +126,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, optimizer->Optimize(cluster, optimized_item, optimized_graph)); } } - //TopologicalSort(optimized_graph); + TopologicalSort(optimized_graph); // Make sure that the optimizers preserved the graph version and library. DCHECK_GE(optimized_graph->library().function_size(), diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc index 77d4702d21..0e2d408a0b 100644 --- a/tensorflow/core/grappler/utils/topological_sort.cc +++ b/tensorflow/core/grappler/utils/topological_sort.cc @@ -41,10 +41,16 @@ void TopologicalSort(GraphDef* graph) { if (IsMerge(*node)) { ready_inputs[node] = 0; for (const auto& input : node->input()) { - if (IsNextIteration(*output_map.GetNode(input))) { + if (IsNextIteration(*output_map.GetNode(input)) || + IsCall(*output_map.GetNode(input))) { ready_inputs[node]++; } } + } else if (IsReturn(*node)) { + // We need a better condition for Return Cycles as this one allows non recursive Returns + // -which do not create cycles at all- to enter the "ready_nodes" before their actual time comes. 
+ ready_inputs[node] = 1; + } else { ready_inputs[node] = 0; } From 7e5162800ef30796ce1bfbce43ee122ba87f21e9 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis  Date: Thu, 5 Jul 2018 14:12:17 +0300 Subject: [PATCH 39/64] Change a condition in executor to the equivalent IsCall --- tensorflow/core/common_runtime/executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 5455f69f91..8645836791 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2369,7 +2369,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, if (vlog_) VLOG(2) << "Create frame: " << child_name; int parallel_iters = 1; - if (node->op_def().name() != "Call") { + if (!IsCall(node)) { s = GetNodeAttr(node->attrs(), "parallel_iterations", &parallel_iters); DCHECK(s.ok()) << s; } From 896846f9a1bf7bee907f89c116d83689ddfd2e0e Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis  Date: Thu, 5 Jul 2018 14:16:20 +0300 Subject: [PATCH 40/64] Add license preamble to new files I think we still want to distribute our contribution under the Apache License, so it can be easier for others to adopt it.
--- .../grappler/optimizers/function_transformation.cc | 12 ++++++++++++ .../grappler/optimizers/function_transformation.h | 4 +--- tensorflow/core/kernels/function_control_ops.cc | 12 ++++++++++++ tensorflow/core/kernels/function_control_ops.h | 12 ++++++++++++ tensorflow/core/ops/function_control_ops.cc | 12 ++++++++++++ 5 files changed, 49 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index b3f5f97257..23095c794d 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -1,3 +1,15 @@ +/* Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ #include "tensorflow/core/grappler/optimizers/function_transformation.h" #include diff --git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h index 245b37e369..8436b751bb 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.h +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -1,6 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); +/* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at diff --git a/tensorflow/core/kernels/function_control_ops.cc b/tensorflow/core/kernels/function_control_ops.cc index 9d190e324b..e3b5749c14 100644 --- a/tensorflow/core/kernels/function_control_ops.cc +++ b/tensorflow/core/kernels/function_control_ops.cc @@ -1,3 +1,15 @@ +/* Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ #include "tensorflow/core/kernels/function_control_ops.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/core/kernels/function_control_ops.h b/tensorflow/core/kernels/function_control_ops.h index 62aaada374..2d528ad56a 100644 --- a/tensorflow/core/kernels/function_control_ops.h +++ b/tensorflow/core/kernels/function_control_ops.h @@ -1,3 +1,15 @@ +/* Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ #ifndef TENSORFLOW_KERNELS_FUNCTION_CONTROL_OPS_H_ #define TENSORFLOW_KERNELS_FUNCTION_CONTROL_OPS_H_ diff --git a/tensorflow/core/ops/function_control_ops.cc b/tensorflow/core/ops/function_control_ops.cc index 731fc26963..829b7e5cd9 100644 --- a/tensorflow/core/ops/function_control_ops.cc +++ b/tensorflow/core/ops/function_control_ops.cc @@ -1,3 +1,15 @@ +/* Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" From 55603da582b3055e52a7946f094ed0d1f70dc184 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Thu, 5 Jul 2018 16:45:12 +0300 Subject: [PATCH 41/64] Use functions.cc from commit 7013a5 --- tensorflow/core/grappler/utils/functions.cc | 248 ++++++++++---------- 1 file changed, 124 insertions(+), 124 deletions(-) diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 8e7243ceed..4f286ce1c8 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -26,128 +26,128 @@ limitations under the License. 
#include "tensorflow/core/grappler/utils.h" namespace tensorflow { - namespace grappler { - - std::unique_ptr GrapplerItemFromFunctionDef( - const FunctionDef& func, - const std::unordered_map& func_attr, - const FunctionDefLibrary& library) { - if (func.signature().name().empty()) { - LOG(ERROR) << "function name must be specified."; - return nullptr; - } - std::unique_ptr new_item(new GrapplerItem()); - new_item->id = func.signature().name(); - - std::unordered_map port_map; - - // Add the function inputs as placeholder - for (const auto& inp : func.signature().input_arg()) { - NodeDef* ph = new_item->graph.add_node(); - ph->set_name(inp.name()); - ph->set_op("Placeholder"); - if (inp.type() != DT_INVALID) { - (*ph->mutable_attr())["T"].set_type(inp.type()); - } else { - auto it = func_attr.find(inp.type_attr()); - if (it == func_attr.end()) { - LOG(ERROR) << "Unknown type attribute " << inp.type_attr() - << " for function input " << inp.name(); - return nullptr; - } else { - (*ph->mutable_attr())["T"] = it->second; - } - } - port_map[inp.name()] = inp.name(); - } - - // Add the function body to the graph. - FunctionLibraryDefinition func_def(OpRegistry::Global(), library); - - for (const NodeDef& node : func.node_def()) { - NodeDef* new_node = new_item->graph.add_node(); - *new_node = node; - // Replace the placeholder attribute values with the specified value. - for (auto& attr : *new_node->mutable_attr()) { - const string& ph_name = attr.second.placeholder(); - auto it = func_attr.find(ph_name); - if (it != func_attr.end()) { - attr.second = it->second; - } - } - - // Functions use a custom format to encode connectivity. Map these custom - // strings to regular ones. 
- const OpRegistrationData* registration; - Status status = func_def.LookUp(node.op(), ®istration); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " not registered: " << status; - return nullptr; - } - - tensorflow::NameRangeMap inputs; - tensorflow::NameRangeMap outputs; - status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, - &outputs); - if (!status.ok()) { - LOG(ERROR) << "Op " << node.op() << " invalid: " << status; - return nullptr; - } - for (const auto& name_range : outputs) { - string port_prefix = - strings::StrCat(node.name(), ":", name_range.first, ":"); - int index_start = name_range.second.first; - int index_end = name_range.second.second; - for (int i = index_start; i < index_end; ++i) { - string port_id = strings::StrCat(port_prefix, i - index_start); - string port_name = strings::StrCat(node.name(), ":", i); - port_map[port_id] = port_name; - } - } - } - - for (auto& node : *new_item->graph.mutable_node()) { - // Rewrite the inputs to use the normal naming convention. - for (int i = 0; i < node.input_size(); ++i) { - const string& input = node.input(i); - if (IsControlInput(input)) { - // No need to remap control dependencies. - continue; - } else { - auto it = port_map.find(input); - if (it == port_map.end()) { - LOG(ERROR) << "Unknown input: " << input; - return nullptr; - } - node.set_input(i, it->second); - } - } - } - - // Add the function outputs to the list of fetch nodes, taking into account - // the output mapping if any. - for (const auto& out : func.signature().output_arg()) { - auto it = func.ret().find(out.name()); - if (it != func.ret().end()) { - auto it2 = port_map.find(it->second); - if (it2 == port_map.end()) { - LOG(ERROR) << "Unknown output mapping: " << it->first << " to " - << it->second; - return nullptr; - } else { - new_item->fetch.emplace_back(it2->second); - } - } else { - new_item->fetch.emplace_back(out.name()); - } - } - // Add the function inputs to the list of feeds. 
- for (const auto& inp : func.signature().input_arg()) { - new_item->feed.emplace_back(inp.name(), Tensor()); - } - - return new_item; +namespace grappler { + +std::unique_ptr GrapplerItemFromFunctionDef( + const FunctionDef& func, + const std::unordered_map& func_attr, + const FunctionDefLibrary& library) { + if (func.signature().name().empty()) { + LOG(ERROR) << "function name must be specified."; + return nullptr; + } + std::unique_ptr new_item(new GrapplerItem()); + new_item->id = func.signature().name(); + + std::unordered_map port_map; + + // Add the function inputs as placeholder + for (const auto& inp : func.signature().input_arg()) { + NodeDef* ph = new_item->graph.add_node(); + ph->set_name(inp.name()); + ph->set_op("Placeholder"); + if (inp.type() != DT_INVALID) { + (*ph->mutable_attr())["T"].set_type(inp.type()); + } else { + auto it = func_attr.find(inp.type_attr()); + if (it == func_attr.end()) { + LOG(ERROR) << "Unknown type attribute " << inp.type_attr() + << " for function input " << inp.name(); + return nullptr; + } else { + (*ph->mutable_attr())["T"] = it->second; + } + } + port_map[inp.name()] = inp.name(); + } + + // Add the function body to the graph. + FunctionLibraryDefinition func_def(OpRegistry::Global(), library); + + for (const NodeDef& node : func.node_def()) { + NodeDef* new_node = new_item->graph.add_node(); + *new_node = node; + // Replace the placeholder attribute values with the specified value. + for (auto& attr : *new_node->mutable_attr()) { + const string& ph_name = attr.second.placeholder(); + auto it = func_attr.find(ph_name); + if (it != func_attr.end()) { + attr.second = it->second; + } + } + + // Functions use a custom format to encode connectivity. Map these custom + // strings to regular ones. 
+ const OpRegistrationData* registration; + Status status = func_def.LookUp(node.op(), ®istration); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " not registered: " << status; + return nullptr; + } + + tensorflow::NameRangeMap inputs; + tensorflow::NameRangeMap outputs; + status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs, + &outputs); + if (!status.ok()) { + LOG(ERROR) << "Op " << node.op() << " invalid: " << status; + return nullptr; + } + for (const auto& name_range : outputs) { + string port_prefix = + strings::StrCat(node.name(), ":", name_range.first, ":"); + int index_start = name_range.second.first; + int index_end = name_range.second.second; + for (int i = index_start; i < index_end; ++i) { + string port_id = strings::StrCat(port_prefix, i - index_start); + string port_name = strings::StrCat(node.name(), ":", i); + port_map[port_id] = port_name; + } + } + } + + for (auto& node : *new_item->graph.mutable_node()) { + // Rewrite the inputs to use the normal naming convention. + for (int i = 0; i < node.input_size(); ++i) { + const string& input = node.input(i); + if (IsControlInput(input)) { + // No need to remap control dependencies. + continue; + } else { + auto it = port_map.find(input); + if (it == port_map.end()) { + LOG(ERROR) << "Unknown input: " << input; + return nullptr; } - - } // end namespace grappler -} // end namespace tensorflow \ No newline at end of file + node.set_input(i, it->second); + } + } + } + + // Add the function outputs to the list of fetch nodes, taking into account + // the output mapping if any. 
+ for (const auto& out : func.signature().output_arg()) { + auto it = func.ret().find(out.name()); + if (it != func.ret().end()) { + auto it2 = port_map.find(it->second); + if (it2 == port_map.end()) { + LOG(ERROR) << "Unknown output mapping: " << it->first << " to " + << it->second; + return nullptr; + } else { + new_item->fetch.emplace_back(it2->second); + } + } else { + new_item->fetch.emplace_back(out.name()); + } + } + // Add the function inputs to the list of feeds. + for (const auto& inp : func.signature().input_arg()) { + new_item->feed.emplace_back(inp.name(), Tensor()); + } + + return new_item; +} + +} // end namespace grappler +} // end namespace tensorflow From 6d0f462aecb4cf9497123768ec7eb006241eceb4 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Fri, 6 Jul 2018 15:05:59 +0300 Subject: [PATCH 42/64] Some Test Cases --- TESTS/2DimensionOutput.py | 29 +++++++++++++++++++++++++++++ TESTS/{callg.py => fcallsg.py} | 0 TESTS/ff.py | 27 +++++++++++++++++++++++++++ TESTS/{fib.py => fibonacci.py} | 0 TESTS/{func.py => funcSimple.py} | 0 TESTS/not_lazy.py | 29 +++++++++++++++++++++++++++++ TESTS/rec.py | 25 ------------------------- 7 files changed, 85 insertions(+), 25 deletions(-) create mode 100644 TESTS/2DimensionOutput.py rename TESTS/{callg.py => fcallsg.py} (100%) create mode 100644 TESTS/ff.py rename TESTS/{fib.py => fibonacci.py} (100%) rename TESTS/{func.py => funcSimple.py} (100%) create mode 100644 TESTS/not_lazy.py delete mode 100644 TESTS/rec.py diff --git a/TESTS/2DimensionOutput.py b/TESTS/2DimensionOutput.py new file mode 100644 index 0000000000..3ade99220d --- /dev/null +++ b/TESTS/2DimensionOutput.py @@ -0,0 +1,29 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +fac = function.Declare("Fac", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="Fac", out_names=["ret"]) +def FacImpl(n): + return tf.cond(tf.less_equal(n, 1), + lambda: tf.constant([1,1]), + lambda: [n,n]*fac(n-1)) + 
+ +FacImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +x = tf.add(n, 1) +result = fac(x) +y = tf.add(result, [1,1]) + +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() +print(sess.run(y, feed_dict={n: 5})) + +writer.close() + +sess.close() diff --git a/TESTS/callg.py b/TESTS/fcallsg.py similarity index 100% rename from TESTS/callg.py rename to TESTS/fcallsg.py diff --git a/TESTS/ff.py b/TESTS/ff.py new file mode 100644 index 0000000000..2915db693e --- /dev/null +++ b/TESTS/ff.py @@ -0,0 +1,27 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +fac = function.Declare("Fac", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="Fac", out_names=["ret"]) +def FacImpl(n): + return fac(n) + + +FacImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +x = tf.add(n, 1) +result = fac(x) +#y = tf.add(result, 1) + +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() +print(sess.run(result, feed_dict={n: 5})) + +writer.close() + +sess.close() diff --git a/TESTS/fib.py b/TESTS/fibonacci.py similarity index 100% rename from TESTS/fib.py rename to TESTS/fibonacci.py diff --git a/TESTS/func.py b/TESTS/funcSimple.py similarity index 100% rename from TESTS/func.py rename to TESTS/funcSimple.py diff --git a/TESTS/not_lazy.py b/TESTS/not_lazy.py new file mode 100644 index 0000000000..b694c1584e --- /dev/null +++ b/TESTS/not_lazy.py @@ -0,0 +1,29 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +fac = function.Declare("Fac", [("x", tf.int32), ("y", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, tf.int32, func_name="Fac", out_names=["ret"]) +def FacImpl(x, y): + return tf.cond(tf.less_equal(x, 1), + lambda: tf.constant(1), + lambda: fac(x-1, 
fac(x,y))) + +FacImpl.add_to_graph(tf.get_default_graph()) + +x = tf.placeholder(tf.int32, shape=[]) +result = fac(x, 2) + + +y = tf.add(result, 1) + +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() +print(sess.run(y, feed_dict={x:2})) + +writer.close() + +sess.close() diff --git a/TESTS/rec.py b/TESTS/rec.py deleted file mode 100644 index a0afc23f22..0000000000 --- a/TESTS/rec.py +++ /dev/null @@ -1,25 +0,0 @@ -import tensorflow as tf -from tensorflow.python.framework import function - -fib = function.Declare("Fib", [("n", tf.int32)], [("ret", tf.int32)]) - -@function.Defun(tf.int32, func_name="Fib", out_names=["ret"]) -def FibImpl(n): - return tf.cond(tf.less_equal(n, 1), - lambda: tf.constant(1), - lambda: fib(n - 1)) -# + fib(n - 2)) - -FibImpl.add_to_graph(tf.get_default_graph()) - -n = tf.placeholder(tf.int32, shape=[]) -result = fib(n) - -writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) - -sess = tf.Session() -print(sess.run(result, feed_dict={n: 2})) - -writer.close() - -sess.close() From 92bcceba496af323ba7cb5f6d0f6bf8e8d1d3ae4 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Fri, 6 Jul 2018 16:17:56 +0300 Subject: [PATCH 43/64] Fixed Topological Sort --- .../core/grappler/utils/topological_sort.cc | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc index 0e2d408a0b..7df179a361 100644 --- a/tensorflow/core/grappler/utils/topological_sort.cc +++ b/tensorflow/core/grappler/utils/topological_sort.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/utils.h" @@ -31,7 +32,9 @@ void TopologicalSort(GraphDef* graph) { ready_nodes.reserve(graph->node_size()); int front = 0; int back = 0; + std::set merge_nodes; std::unordered_map ready_inputs; + std::unordered_map> returning_nodes; for (int i = 0; i < graph->node_size(); i++) { auto node = graph->mutable_node(i); if (node->input_size() == 0) { @@ -41,21 +44,49 @@ void TopologicalSort(GraphDef* graph) { if (IsMerge(*node)) { ready_inputs[node] = 0; for (const auto& input : node->input()) { - if (IsNextIteration(*output_map.GetNode(input)) || - IsCall(*output_map.GetNode(input))) { + if (IsNextIteration(*output_map.GetNode(input))) { ready_inputs[node]++; } + else if (IsCall(*output_map.GetNode(input))) { + // We don't want to increase merge's ready_inputs + // every time we meet a Call input. Just Once. + merge_nodes.emplace(node); + } } } else if (IsReturn(*node)) { - // We need a better condition for Return Cycles as this one allows non recursive Returns - // -which do not create cycles at all- to enter the "ready_nodes" before their actual time comes. - ready_inputs[node] = 1; + // Nodes that send their output to "Return" nodes are + // function Returning Nodes and in case of recursive functions + // those nodes are part of graph cycles. 
+ for (const auto& input : node->input()) { + NodeDef *prevNode = output_map.GetNode(input); + // In order to detect the recursion cycles we depend on + // the fact that a recursive function's returning node, + // will be sending outputs to at least 2 "Return" nodes + // with different "frame_name" attributes (same "frame_name" + // attrs would mean that they belong in the same function call + // but they correspond to different function outputs) + string frame_name; + GetNodeAttr(AttrSlice(*node), "frame_name", &frame_name); + returning_nodes[prevNode].emplace(frame_name); + } + ready_inputs[node] = 0; } else { ready_inputs[node] = 0; } } + for (const auto& merge : merge_nodes) { + ready_inputs[merge]++; + } + + for (const auto& retnode : returning_nodes) { + if (retnode.second.size() > 1) { + // Detected Cycle + ready_inputs[retnode.first]++; + } + } + while (front != back) { auto ready_node = ready_nodes[front]; for (const auto& fanout_pair : output_map.GetOutputs(ready_node->name())) { From b95938d26f1bdfe5560d8ef731378cbd32c8e86e Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Fri, 6 Jul 2018 16:32:09 +0300 Subject: [PATCH 44/64] Mini optimization in Topological Sort --- tensorflow/core/grappler/utils/topological_sort.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc index 7df179a361..5812c14c90 100644 --- a/tensorflow/core/grappler/utils/topological_sort.cc +++ b/tensorflow/core/grappler/utils/topological_sort.cc @@ -32,7 +32,6 @@ void TopologicalSort(GraphDef* graph) { ready_nodes.reserve(graph->node_size()); int front = 0; int back = 0; - std::set merge_nodes; std::unordered_map ready_inputs; std::unordered_map> returning_nodes; for (int i = 0; i < graph->node_size(); i++) { @@ -50,7 +49,7 @@ void TopologicalSort(GraphDef* graph) { else if (IsCall(*output_map.GetNode(input))) { // We don't want to increase merge's 
ready_inputs // every time we meet a Call input. Just Once. - merge_nodes.emplace(node); + ready_inputs[node] = 1; } } } else if (IsReturn(*node)) { @@ -76,10 +75,6 @@ void TopologicalSort(GraphDef* graph) { } } - for (const auto& merge : merge_nodes) { - ready_inputs[merge]++; - } - for (const auto& retnode : returning_nodes) { if (retnode.second.size() > 1) { // Detected Cycle From 0d193af7d0a91216df4073693b218fac4147e9d8 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Tue, 10 Jul 2018 20:33:03 +0300 Subject: [PATCH 45/64] Renamed CallOp class --- TESTS/fibonacci.py | 2 +- .../core/kernels/function_control_ops.cc | 22 +++++++++---------- .../core/kernels/function_control_ops.h | 8 +++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/TESTS/fibonacci.py b/TESTS/fibonacci.py index c6b4e4e9c1..680f8be425 100644 --- a/TESTS/fibonacci.py +++ b/TESTS/fibonacci.py @@ -24,6 +24,6 @@ def FibImpl(n): writer.close() -print(sess.run(res, feed_dict={n: 5})) +print(sess.run(res, feed_dict={n: 24})) sess.close() diff --git a/tensorflow/core/kernels/function_control_ops.cc b/tensorflow/core/kernels/function_control_ops.cc index e3b5749c14..a22c079102 100644 --- a/tensorflow/core/kernels/function_control_ops.cc +++ b/tensorflow/core/kernels/function_control_ops.cc @@ -20,7 +20,7 @@ limitations under the License. 
namespace tensorflow { -void CallOp::Compute(OpKernelContext* context) { +void CallOpe::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { context->forward_ref_input_to_ref_output(0, 0); } else { @@ -28,15 +28,15 @@ void CallOp::Compute(OpKernelContext* context) { } } -REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOp); -REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOp); +REGISTER_KERNEL_BUILDER(Name("Call").Device(DEVICE_CPU), CallOpe); +REGISTER_KERNEL_BUILDER(Name("RefCall").Device(DEVICE_CPU), CallOpe); #define REGISTER_GPU_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ - Name("Call").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) + Name("Call").Device(DEVICE_GPU).TypeConstraint("T"), CallOpe) #define REGISTER_GPU_REF_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ - Name("RefCall").Device(DEVICE_GPU).TypeConstraint("T"), CallOp) + Name("RefCall").Device(DEVICE_GPU).TypeConstraint("T"), CallOpe) TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL); @@ -49,13 +49,13 @@ REGISTER_GPU_REF_KERNEL(bool); #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ - Name("Call").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) + Name("Call").Device(DEVICE_SYCL).TypeConstraint("T"), CallOpe) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); #define REGISTER_SYCL_REF_KERNEL(type) \ REGISTER_KERNEL_BUILDER( \ - Name("RefCall").Device(DEVICE_SYCL).TypeConstraint("T"), CallOp) + Name("RefCall").Device(DEVICE_SYCL).TypeConstraint("T"), CallOpe) REGISTER_SYCL_REF_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); @@ -67,7 +67,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ - CallOp) + CallOpe) #define REGISTER_SYCL_HOST_REF_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("RefCall") \ @@ -75,7 +75,7 @@ 
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ - CallOp) + CallOpe) REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_REF_KERNEL(int32); @@ -93,7 +93,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ - CallOp) + CallOpe) #define REGISTER_GPU_HOST_REF_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("RefCall") \ @@ -101,7 +101,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); .HostMemory("data") \ .HostMemory("output") \ .TypeConstraint("T"), \ - CallOp) + CallOpe) REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_REF_KERNEL(int32); diff --git a/tensorflow/core/kernels/function_control_ops.h b/tensorflow/core/kernels/function_control_ops.h index 2d528ad56a..b03d3eae9a 100644 --- a/tensorflow/core/kernels/function_control_ops.h +++ b/tensorflow/core/kernels/function_control_ops.h @@ -20,14 +20,14 @@ namespace tensorflow { // A call op has one input and one output. It creates or finds // the child frame that is uniquely identified by the frame_name, // and makes its input available to the child frame. -class CallOp : public OpKernel { +class CallOpe : public OpKernel { public: - explicit CallOp(OpKernelConstruction* context) : OpKernel(context) {} + explicit CallOpe(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override; bool IsExpensive() override { return false; } - ~CallOp() override {} + ~CallOpe() override {} - TF_DISALLOW_COPY_AND_ASSIGN(CallOp); + TF_DISALLOW_COPY_AND_ASSIGN(CallOpe); }; // A Return op has one input and one output. 
 It exits the current From 6529bbc0e8cf5dff8cae74e407f9989ca5fba57a Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Wed, 11 Jul 2018 07:19:58 +0300 Subject: [PATCH 46/64] Add Ackermann function as test --- TESTS/ackermann.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 TESTS/ackermann.py diff --git a/TESTS/ackermann.py b/TESTS/ackermann.py new file mode 100644 index 0000000000..bac81b2088 --- /dev/null +++ b/TESTS/ackermann.py @@ -0,0 +1,29 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +ack = function.Declare("ack", [("n", tf.int32), ("m", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, tf.int32, func_name="Ack", out_names=["ret"]) +def AckImpl(n,m): + return tf.cond(tf.equal(m, 0), + lambda: n + 1, + lambda: tf.cond(tf.equal(n, 0), + lambda: ack(m-1,1), + lambda: ack(m-1,ack(m,n-1)))) + +AckImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +m = tf.placeholder(tf.int32, shape=[]) +res = ack(n,m) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() + +#print(tf.get_default_graph().as_graph_def()) + +writer.close() +print(sess.run(res, feed_dict={n:2, m:3})) + +sess.close() From c953c6a7c5e4de4e87ff9ba1256c6942a4ac200e Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Wed, 11 Jul 2018 14:51:41 +0300 Subject: [PATCH 47/64] Add some more classic benchmarks --- TESTS/primes.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++ TESTS/takeuchi.py | 28 +++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 TESTS/primes.py create mode 100644 TESTS/takeuchi.py diff --git a/TESTS/primes.py b/TESTS/primes.py new file mode 100644 index 0000000000..7668c983fa --- /dev/null +++ b/TESTS/primes.py @@ -0,0 +1,62 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +primes = function.Declare("primes", [("x", tf.int32)], [("ret", tf.int32)]) +findPrimePlus = 
 function.Declare("findPrimePlus", [("n", tf.int32),("i", tf.int32)], [("ret", tf.int32)]) +findPrimeMinus = function.Declare("findPrimeMinus", [("n", tf.int32),("i", tf.int32)], [("ret", tf.int32)]) +testPrime = function.Declare("testPrime", [("n", tf.int32),("i", tf.int32)], [("ret", tf.bool)]) + + +@function.Defun(tf.int32, func_name="primes", out_names=["ret"]) +def PrimesImpl(n): + return tf.cond(tf.less_equal(n, 0), + lambda: 2, + lambda: tf.cond(tf.equal(n, 1), + lambda: 3, + lambda: findPrimeMinus(n-2,1) + )) +PrimesImpl.add_to_graph(tf.get_default_graph()) + +@function.Defun(tf.int32, tf.int32, func_name="findPrimeMinus", out_names=["ret"]) +def FindPrimeMinusImpl(n,i): + return tf.cond(testPrime(6*i-1, 1), + lambda: tf.cond(tf.equal(n, 0), + lambda: 6*i-1, + lambda: findPrimePlus(n-1,i)), + lambda: findPrimePlus(n,i)) +FindPrimeMinusImpl.add_to_graph(tf.get_default_graph()) + +@function.Defun(tf.int32, tf.int32, func_name="findPrimePlus", out_names=["ret"]) +def FindPrimePlusImpl(n,i): + return tf.cond(testPrime(6*i-1, 1), + lambda: tf.cond(tf.equal(n, 0), + lambda: 6*i-1, + lambda: findPrimeMinus(n-1,i+1)), + lambda: findPrimeMinus(n,i+1)) +FindPrimePlusImpl.add_to_graph(tf.get_default_graph()) + + +@function.Defun(tf.int32, tf.int32, func_name="testPrime", out_names=["ret"]) +def TestPrimeImpl(n,i): + return tf.cond(tf.greater((6*i-1)*(6*i-1), n), + lambda: True, + lambda: tf.cond(tf.equal(tf.mod(n, (6*i-1)), 0), + lambda: False, + lambda: tf.cond(tf.equal(tf.mod(n, (6*i-1)), 0), + lambda: False, + lambda: testPrime(n, i+1)))) +TestPrimeImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +res = primes(n) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() + +#print(tf.get_default_graph().as_graph_def()) + +writer.close() +print(sess.run(res, feed_dict={n:7500})) + +sess.close() diff --git a/TESTS/takeuchi.py b/TESTS/takeuchi.py new file mode 100644 index 0000000000..a90e1b78a6 --- 
 /dev/null +++ b/TESTS/takeuchi.py @@ -0,0 +1,28 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +tak = function.Declare("tak", [("x", tf.int32), ("y", tf.int32), ("z", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, tf.int32, tf.int32, func_name="Tak", out_names=["ret"]) +def TakImpl(x,y,z): + return tf.cond(tf.less(y, x), + lambda: tak(tak(x-1,y,z), tak(y-1,z,x), tak(z-1,x,y)), + lambda: z) + +TakImpl.add_to_graph(tf.get_default_graph()) + +x = tf.placeholder(tf.int32, shape=[]) +y = tf.placeholder(tf.int32, shape=[]) +z = tf.placeholder(tf.int32, shape=[]) +res = tak(x,y,z) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +sess = tf.Session() + +#print(tf.get_default_graph().as_graph_def()) + +writer.close() +print(sess.run(res, feed_dict={x:24, y:16, z:8})) + +sess.close() From 8cf77f7cb61244151af30bd7899e588d4186599f Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Fri, 13 Jul 2018 07:40:42 +0300 Subject: [PATCH 48/64] Change indentation in function trans --- .../optimizers/function_transformation.cc | 182 ++++++++---------- .../optimizers/function_transformation.h | 32 +-- 2 files changed, 99 insertions(+), 115 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 23095c794d..dab1c2828e 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -35,62 +35,62 @@ namespace tensorflow { namespace grappler { namespace { - class FunctionInliningContext { - public: - explicit FunctionInliningContext(const GrapplerItem& item) - : library_(&item.graph.library()), functions_(InliningCandidates(item)) {} - - const FunctionDefLibrary& Library() const { return *library_; } - - bool HasInlinedFunctions() const { return !functions_.empty(); } - - // Find inlining candidate by name. 
Return nullptr if not found. - const FunctionDef* FindInlinedFunction(const string& name) const { - auto it = functions_.find(name); - if (it != functions_.end()) { - return it->second; - } else { - return nullptr; - } - } +class FunctionInliningContext { +public: + explicit FunctionInliningContext(const GrapplerItem& item) + : library_(&item.graph.library()), functions_(InliningCandidates(item)) {} + + const FunctionDefLibrary& Library() const { return *library_; } + + bool HasInlinedFunctions() const { return !functions_.empty(); } + + // Find inlining candidate by name. Return nullptr if not found. + const FunctionDef* FindInlinedFunction(const string& name) const { + auto it = functions_.find(name); + if (it != functions_.end()) { + return it->second; + } else { + return nullptr; + } + } - private: - std::unordered_map InliningCandidates(const GrapplerItem& item) const { - std::unordered_map functions; - for (const FunctionDef& func : item.graph.library().function()) { - // Don't inline functions marked as noinline +private: + std::unordered_map InliningCandidates(const GrapplerItem& item) const { + std::unordered_map functions; + for (const FunctionDef& func : item.graph.library().function()) { + // Don't inline functions marked as noinline // if (func.attr().count("_noinline") != 0) { // continue; // } - // Don't touch anything marked XLA to prevent XLA failures further down - // the road. - if (func.attr().count("_XlaCompile") > 0 && - func.attr().at("_XlaCompile").b()) { - continue; - } - // Can't create IdentityN nodes with no input or output: skip these - // functions for now. - if (func.signature().input_arg_size() == 0 || - func.signature().output_arg_size() == 0) { - continue; - } - functions[func.signature().name()] = &func; - } - return functions; + // Don't touch anything marked XLA to prevent XLA failures further down + // the road. 
+ if (func.attr().count("_XlaCompile") > 0 && + func.attr().at("_XlaCompile").b()) { + continue; + } + // Can't create IdentityN nodes with no input or output: skip these + // functions for now. + if (func.signature().input_arg_size() == 0 || + func.signature().output_arg_size() == 0) { + continue; } + functions[func.signature().name()] = &func; + } + return functions; + } - const FunctionDefLibrary* library_; - std::unordered_map functions_; + const FunctionDefLibrary* library_; + std::unordered_map functions_; - TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext); - }; + TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext); +}; - // Copy input/output argument type to the type. Return error if argument - // type is not explicitly defined, and not specified in function attributes. - Status CopyArgType(const NodeDef& func_node, - const std::unordered_map& func_attr, - const string& arg_kind, const OpDef::ArgDef& arg, - DataType* type) { +// Copy input/output argument type to the type. Return error if argument +// type is not explicitly defined, and not specified in function attributes. +Status CopyArgType(const NodeDef& func_node, + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + DataType* type) { if (arg.type() != DT_INVALID) { *type = arg.type(); } else { @@ -103,14 +103,14 @@ namespace { *type = it->second.type(); } return Status::OK(); - } - - // Copy input/output argument type to the type_list. Return error if argument - // type is not explicitly defined, and not specified in function attributes. - Status CopyArgTypeN(const NodeDef& func_node, - const std::unordered_map& func_attr, - const string& arg_kind, const OpDef::ArgDef& arg, - AttrValue::ListValue* type_list) { +} + +// Copy input/output argument type to the type_list. Return error if argument +// type is not explicitly defined, and not specified in function attributes. 
+Status CopyArgTypeN(const NodeDef& func_node, + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + AttrValue::ListValue* type_list) { if (arg.type() != DT_INVALID) { type_list->add_type(arg.type()); } else { @@ -123,10 +123,9 @@ namespace { type_list->add_type(it->second.type()); } return Status::OK(); - } - - string ParseString(string input) { +} +string ParseString(string input) { size_t pos = 0; std::string res = ""; std::string delimiter = ":"; @@ -140,14 +139,13 @@ namespace { res = input + "/Ret0"; } return res; - } +} - Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, - const FunctionInliningContext& function_inlining_ctx) { +Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, + const FunctionInliningContext& function_inlining_ctx) { for (const NodeDef& node : item.graph.node()) { const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); if (func != nullptr) { // If it's a function calling node - for (int i = 0; i < func->signature().output_arg_size(); ++i) { // const OpDef::ArgDef &arg = func->signature().output_arg(i); foutputs.emplace(node.name()); // Fac @@ -157,10 +155,10 @@ namespace { } } return Status::OK(); - } +} - Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, - std::unordered_map &functions_in) { +Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, + std::unordered_map &functions_in) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); DataType type; @@ -199,15 +197,14 @@ namespace { } return Status::OK(); - } - +} - Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, - GraphDef* optimized_graph, std::unordered_map &functions_in) { +Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, + GraphDef* optimized_graph, 
std::unordered_map &functions_in) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); - std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); + if (!item) { return errors::InvalidArgument("Failed to inline function ", func_node.op(), " instantiated by ", func_node.name()); } @@ -246,9 +243,7 @@ namespace { argmerge_map.emplace(arg.name(), merge); } - for (NodeDef& func_body_node : *item->graph.mutable_node()) { - // If the func body node is func's input argument if (input_nodes.find(func_body_node.name()) != input_nodes.end()) { CHECK_EQ(0, func_body_node.input_size()); @@ -258,10 +253,7 @@ namespace { } // Connect merge with input arg func_body_node.add_input(argmerge_map[func_body_node.name()]->name()); - } - - // Else if not an input_arg_node - else { + } else { // Else if not an input_arg_node // Update the input names if any. for (string& input : *func_body_node.mutable_input()) { @@ -271,7 +263,6 @@ namespace { } input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); } - // If the node has no input, make hook it up to the Merge nodes to ensure // it runs in the same frame as the other nodes of the function body. 
if (func_body_node.input_size() == 0) { @@ -335,7 +326,6 @@ namespace { // Break IdentityN Merges into multiple common Binary Merge ops int j=0; for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { - DataType type; NodeDef *new_merge, *merge = it->second; int i, size = merge->input_size(); @@ -346,15 +336,11 @@ namespace { merge->set_op("Identity"); merge->set_device(func_node.device()); (*merge->mutable_attr())["T"].set_type(type); - } - - else { - + } else { string name = merge->name(); string in1 = merge->input(0), in2; for (i = 1; i < size-1; i++) { - in2 = merge->input(i); new_merge = optimized_graph->add_node(); @@ -383,21 +369,19 @@ namespace { } return Status::OK(); - } +} } // namespace - - Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - +Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { FunctionInliningContext function_inlining_ctx(item); - std::set foutputs; + GatherOutputs(foutputs, item, function_inlining_ctx); - //std::cout << foutputs.size() << '\n'; - //for( const auto& str : foutputs ) std::cout << str << '\n'; + //std::cout << foutputs.size() << '\n'; + //for( const auto& str : foutputs ) std::cout << str << '\n'; // Nothing to do here. if (!function_inlining_ctx.HasInlinedFunctions()) { @@ -455,8 +439,7 @@ namespace { } } } - } - else { + } else { *optimized_graph->add_node() = node; } } @@ -488,12 +471,13 @@ namespace { /******************************************************************************************************/ return Status::OK(); - } +} - void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimized_graph, - double result) { +void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, + double result) { // Nothing to do for FunctionOptimizer. 
- } +} + } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h index 8436b751bb..feadcb1fd4 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.h +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -20,27 +20,27 @@ limitations under the License. namespace tensorflow { namespace grappler { - typedef std::unordered_map ArgMergeMap; +typedef std::unordered_map ArgMergeMap; - typedef struct { - ArgMergeMap argMergeMap; - gtl::ArraySlice fetch; - } FuncInfo; +typedef struct { + ArgMergeMap argMergeMap; + gtl::ArraySlice fetch; +} FuncInfo; - // Replace function calling nodes with pairs of new 'Call/Return' operators - class FunctionTransformation : public GraphOptimizer { - public: - FunctionTransformation() {} - ~FunctionTransformation() override {} +// Replace function calling nodes with pairs of new 'Call/Return' operators +class FunctionTransformation : public GraphOptimizer { +public: + FunctionTransformation() {} + ~FunctionTransformation() override {} - string name() const override { return "function_transformation"; }; + string name() const override { return "function_transformation"; }; - Status Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) override; + Status Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) override; - void Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimized_graph, double result) override; - }; + void Feedback(Cluster* cluster, const GrapplerItem& item, + const GraphDef& optimized_graph, double result) override; +}; } // end namespace grappler } // end namespace tensorflow From 0afc0e516b2b055b9723cb9c3ea575a17b583792 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Fri, 13 Jul 2018 08:08:32 +0300 Subject: [PATCH 49/64] Change some more indentation --- 
.../optimizers/function_transformation.cc | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index dab1c2828e..049ad588eb 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -88,9 +88,9 @@ class FunctionInliningContext { // Copy input/output argument type to the type. Return error if argument // type is not explicitly defined, and not specified in function attributes. Status CopyArgType(const NodeDef& func_node, - const std::unordered_map& func_attr, - const string& arg_kind, const OpDef::ArgDef& arg, - DataType* type) { + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + DataType* type) { if (arg.type() != DT_INVALID) { *type = arg.type(); } else { @@ -108,9 +108,9 @@ Status CopyArgType(const NodeDef& func_node, // Copy input/output argument type to the type_list. Return error if argument // type is not explicitly defined, and not specified in function attributes. 
Status CopyArgTypeN(const NodeDef& func_node, - const std::unordered_map& func_attr, - const string& arg_kind, const OpDef::ArgDef& arg, - AttrValue::ListValue* type_list) { + const std::unordered_map& func_attr, + const string& arg_kind, const OpDef::ArgDef& arg, + AttrValue::ListValue* type_list) { if (arg.type() != DT_INVALID) { type_list->add_type(arg.type()); } else { @@ -141,10 +141,10 @@ string ParseString(string input) { return res; } -Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, - const FunctionInliningContext& function_inlining_ctx) { +Status GatherOutputs(const GrapplerItem& item, const FunctionInliningContext& ctx, + std::set &foutputs) { for (const NodeDef& node : item.graph.node()) { - const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); + const FunctionDef* func = ctx.FindInlinedFunction(node.op()); if (func != nullptr) { // If it's a function calling node for (int i = 0; i < func->signature().output_arg_size(); ++i) { // const OpDef::ArgDef &arg = func->signature().output_arg(i); @@ -157,8 +157,9 @@ Status GatherOutputs(std::set &foutputs, const GrapplerItem& item, return Status::OK(); } -Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, - std::unordered_map &functions_in) { +Status CreateCycle(const NodeDef& func_node, const FunctionDef& func, + GraphDef* optimized_graph, + std::unordered_map &functions_in) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); DataType type; @@ -199,18 +200,21 @@ Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimi return Status::OK(); } - -Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, - GraphDef* optimized_graph, std::unordered_map &functions_in) { +Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, + const FunctionInliningContext& ctx, + GraphDef* optimized_graph, + 
std::unordered_map &functions_in) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); if (!item) { - return errors::InvalidArgument("Failed to inline function ", func_node.op(), " instantiated by ", func_node.name()); + return errors::InvalidArgument( + "Failed to inline function ", func_node.op(), + " instantiated by ", func_node.name()); } std::set foutputs; - GatherOutputs(foutputs, *item, ctx); + GatherOutputs(*item, ctx, foutputs); DataType type; std::unordered_map input_nodes; @@ -374,17 +378,17 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const F } // namespace Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - FunctionInliningContext function_inlining_ctx(item); + GraphDef* optimized_graph) { + FunctionInliningContext ctx(item); std::set foutputs; - GatherOutputs(foutputs, item, function_inlining_ctx); + GatherOutputs(item, ctx, foutputs); //std::cout << foutputs.size() << '\n'; //for( const auto& str : foutputs ) std::cout << str << '\n'; // Nothing to do here. 
- if (!function_inlining_ctx.HasInlinedFunctions()) { + if (!ctx.HasInlinedFunctions()) { *optimized_graph = item.graph; return Status::OK(); } @@ -400,11 +404,11 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it } } - const FunctionDef* func = function_inlining_ctx.FindInlinedFunction(node.op()); + const FunctionDef* func = ctx.FindInlinedFunction(node.op()); if (func != nullptr) { FuncInfo func_info; functions_in.emplace(node.op(), func_info); - InlineFunction(node, *func, function_inlining_ctx, optimized_graph, functions_in); + InlineFunction(node, *func, ctx, optimized_graph, functions_in); functions_in.erase(node.op()); // At this point functions_in will be empty // Check if the function node corresponded to some fetch_outputs @@ -474,9 +478,9 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it } void FunctionTransformation::Feedback(Cluster* cluster, const GrapplerItem& item, - const GraphDef& optimized_graph, - double result) { - // Nothing to do for FunctionOptimizer. + const GraphDef& optimized_graph, + double result) { + // Nothing to do for FunctionTransformation. 
} } // end namespace grappler From e01e329bc9c74ea57da3f8739dbb627cc4b0eb0b Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Fri, 13 Jul 2018 10:47:53 +0300 Subject: [PATCH 50/64] Move typedef to cc --- .../optimizers/function_transformation.cc | 17 +++++++++++++---- .../optimizers/function_transformation.h | 6 ------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 049ad588eb..b24603104f 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -35,8 +35,16 @@ namespace tensorflow { namespace grappler { namespace { +typedef std::unordered_map ArgMergeMap; + +typedef struct { + ArgMergeMap argMergeMap; + gtl::ArraySlice fetch; +} FuncInfo; + +// same with commit b691c0 (possibly) class FunctionInliningContext { -public: + public: explicit FunctionInliningContext(const GrapplerItem& item) : library_(&item.graph.library()), functions_(InliningCandidates(item)) {} @@ -54,7 +62,7 @@ class FunctionInliningContext { } } -private: + private: std::unordered_map InliningCandidates(const GrapplerItem& item) const { std::unordered_map functions; for (const FunctionDef& func : item.graph.library().function()) { @@ -465,8 +473,9 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it const size_t proto_size = optimized_graph->ByteSizeLong(); void* buf = port::Malloc(proto_size); if (buf == nullptr) { - return tensorflow::errors::ResourceExhausted("Failed to allocate memory to serialize message of type '" - ,optimized_graph->GetTypeName(), "' and size ", proto_size); + return errors::ResourceExhausted( + "Failed to allocate memory to serialize message of type '" , + optimized_graph->GetTypeName(), "' and size ", proto_size); } optimized_graph->SerializeToArray(buf, proto_size); const void* bf = buf; diff 
--git a/tensorflow/core/grappler/optimizers/function_transformation.h b/tensorflow/core/grappler/optimizers/function_transformation.h index feadcb1fd4..8ed60b3061 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.h +++ b/tensorflow/core/grappler/optimizers/function_transformation.h @@ -20,12 +20,6 @@ limitations under the License. namespace tensorflow { namespace grappler { -typedef std::unordered_map ArgMergeMap; - -typedef struct { - ArgMergeMap argMergeMap; - gtl::ArraySlice fetch; -} FuncInfo; // Replace function calling nodes with pairs of new 'Call/Return' operators class FunctionTransformation : public GraphOptimizer { From 3d3cfcc505b51ee8647ab54e904ea777cd2d2b88 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Fri, 13 Jul 2018 16:30:57 +0300 Subject: [PATCH 51/64] Optimized Tags - Call/Return frame_names are now small 1-2 character strings - Removed ":0" suffix from each new generated frame - added 'clock' in commit --- .../core/common_runtime/direct_session.cc | 12 ++++++++ tensorflow/core/common_runtime/executor.cc | 28 ++++++++++++------ .../common_runtime/graph_execution_state.cc | 2 +- .../optimizers/function_transformation.cc | 29 ++++++++++++------- 4 files changed, 50 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 8674831eac..5cd1e042c8 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -19,6 +19,9 @@ limitations under the License. 
 #include #include +#include +#include + #include "tensorflow/core/common_runtime/constant_folding.h" #include "tensorflow/core/common_runtime/debugger_state_interface.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -581,8 +584,13 @@ Status DirectSession::Run(const RunOptions& run_options, return errors::Cancelled("Run call was cancelled"); } + clock_t t; for (const auto& item : executors_and_keys->items) { + t = clock(); + item.executor->RunAsync(args, barrier->Get()); + + } WaitForNotification(&run_state, &step_cancellation_manager, @@ -590,6 +598,10 @@ Status DirectSession::Run(const RunOptions& run_options, ? run_options.timeout_in_ms() : operation_timeout_in_ms_); + t = clock() - t; + std::cout << "time: " << t << " clock ticks" << std::endl; + std::cout << "time: " << t*1.0/CLOCKS_PER_SEC << " seconds" << std::endl; + if (!cancellation_manager_->DeregisterCallback(cancellation_token)) { // The step has been cancelled: make sure we don't attempt to receive the // outputs as this would make it block forever. diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index c8acf3234b..7c0564bc50 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1242,6 +1242,10 @@ class ExecutorState { const string& name) { return strings::StrCat(frame->frame_name, ";", iter_id, ";", name); } + // The unique name of a frame. + inline string MakeFrameNameFunctions(FrameState* frame, const string& name) { + return strings::StrCat(frame->frame_name, ";", name); + } // Find an existing or create a new child frame in the frame 'frame' at // iteration 'iter'. 
@@ -1997,8 +2001,8 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, FrameState* output_frame = input_frame; int64 output_iter = input_iter; -printf("Propagate Outputs: %s\n", node->name().c_str()); -printf("Frame: %s\n", input_frame->frame_name.c_str()); +//printf("Propagate Outputs: %s\n", node->name().c_str()); +//printf("Frame: %s\n", input_frame->frame_name.c_str()); if (!item->is_enter_exit_or_next_iter && !item->is_call_or_return) { // Fast path for nodes types that don't need special handling @@ -2366,7 +2370,17 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, string enter_name; Status s = GetNodeAttr(node->attrs(), "frame_name", &enter_name); DCHECK(s.ok()) << s; - const string child_name = MakeFrameName(frame, iter, enter_name); + string child_name; + + int parallel_iters = 1; + if (!IsCall(node)) { + s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); + DCHECK(s.ok()) << s; + + child_name = MakeFrameName(frame, iter, enter_name); + } + + else child_name = MakeFrameNameFunctions(frame, enter_name); { mutex_lock executor_lock(mu_); @@ -2381,11 +2395,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, // Note that this new frame instance is created without any locks. 
if (vlog_) VLOG(2) << "Create frame: " << child_name; - int parallel_iters = 1; - if (!IsCall(node)) { - s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); - DCHECK(s.ok()) << s; - } + FrameState* temp = new FrameState(impl_, parallel_iters); temp->frame_name = child_name; temp->frame_id = Hash64(child_name); @@ -2569,7 +2579,7 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, string frameName; GetNodeAttr(dst_item->node->attrs(), "frame_name", &frameName); - frameName = strings::StrCat(parent_frame->frame_name, ";0;", frameName); + frameName = strings::StrCat(parent_frame->frame_name, ";", frameName); wrong_ret = (frameName != frame_name); } diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 772e687187..be46a9885b 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -380,7 +380,7 @@ Status GraphExecutionState::OptimizeGraph( const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); - printf("Transformation passed successfully!\n"); +// printf("Transformation passed successfully!\n"); /*******************************************************************************************/ // The graph conversion sets the requested device names but not the assigned // device names. 
However, since at this point the graph is placed TF expects diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index b24603104f..c7173dd657 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -165,9 +165,9 @@ Status GatherOutputs(const GrapplerItem& item, const FunctionInliningContext& ct return Status::OK(); } -Status CreateCycle(const NodeDef& func_node, const FunctionDef& func, - GraphDef* optimized_graph, - std::unordered_map &functions_in) { + +Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, + std::unordered_map &functions_in , int& frame_name) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); DataType type; @@ -184,7 +184,7 @@ Status CreateCycle(const NodeDef& func_node, const FunctionDef& func, call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); - (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(frame_name)); (*call->mutable_attr())["is_constant"].set_b(false); NodeDef* merge = argmerge_map[arg.name()]; @@ -202,16 +202,21 @@ Status CreateCycle(const NodeDef& func_node, const FunctionDef& func, ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); (*ret->mutable_attr())["T"].set_type(type); - (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(frame_name)); } return Status::OK(); } + Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, GraphDef* 
optimized_graph, - std::unordered_map &functions_in) { + std::unordered_map &functions_in, + int& frame_name) { + + int cpframe_name = frame_name; + const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); std::unique_ptr item = GrapplerItemFromFunctionDef(func, func_attr, ctx.Library()); @@ -242,7 +247,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); - (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(frame_name)); (*call->mutable_attr())["is_constant"].set_b(false); // Create and add a temporary merge node (IdentityN) for every input arg @@ -303,12 +308,12 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, if (it == functions_in.end()) { FuncInfo func_info; functions_in.emplace(func_body_node.op(), func_info); - InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in); + InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in, ++frame_name); functions_in.erase(func_body_node.op()); } else { // Already in -> Insert Enter/Exit ops end create cycle // (recursion or mutually recursive functions) - CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in); + CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in, ++frame_name); } } else { // Move the node to the main graph @@ -332,7 +337,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, ret->add_input(strings::StrCat(func_node.name(), "/", input)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); (*ret->mutable_attr())["T"].set_type(type); - (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(func_node.name())); + 
(*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(cpframe_name)); } // Break IdentityN Merges into multiple common Binary Merge ops @@ -388,6 +393,8 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { FunctionInliningContext ctx(item); + + int frame_name = 0; std::set foutputs; GatherOutputs(item, ctx, foutputs); @@ -416,7 +423,7 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it if (func != nullptr) { FuncInfo func_info; functions_in.emplace(node.op(), func_info); - InlineFunction(node, *func, ctx, optimized_graph, functions_in); + InlineFunction(node, *func, ctx, optimized_graph, functions_in, frame_name); functions_in.erase(node.op()); // At this point functions_in will be empty // Check if the function node corresponded to some fetch_outputs From bb403ff3a99f632181f2f9acce3e4138c2078967 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Fri, 13 Jul 2018 17:20:48 +0300 Subject: [PATCH 52/64] "Change MakeFrameFunctions to MakeFrameName;rely on overloading" --- tensorflow/core/common_runtime/executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 7c0564bc50..c1404f4b25 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1243,7 +1243,7 @@ class ExecutorState { return strings::StrCat(frame->frame_name, ";", iter_id, ";", name); } // The unique name of a frame. 
- inline string MakeFrameNameFunctions(FrameState* frame, const string& name) { + inline string MakeFrameName(FrameState* frame, const string& name) { return strings::StrCat(frame->frame_name, ";", name); } @@ -2380,7 +2380,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, child_name = MakeFrameName(frame, iter, enter_name); } - else child_name = MakeFrameNameFunctions(frame, enter_name); + else child_name = MakeFrameName(frame, enter_name); { mutex_lock executor_lock(mu_); From 599c2740a5f4d00c0971dbdbffd9e10b4aaa8f6d Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Sat, 14 Jul 2018 16:12:21 +0300 Subject: [PATCH 53/64] More Opts - Dead Returns are now not added in ready queue - flatmap outstanding_frames_ was moved from ExecutorState to FrameState. Now each parent frame holds its children frames. (Maybe now that there will be only a few elements in outstanding_frames, FlatMap could be changed into a potentially better structure.) - The tags have the form: --- tensorflow/core/common_runtime/executor.cc | 73 +++++++++++----------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index c1404f4b25..320998623a 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1048,6 +1048,13 @@ class ExecutorState { int total_input_tensors = 0; std::vector* nodes = nullptr; + // Mapping from frame name to outstanding frames. A new frame is created + // at some iteration of an active frame. So the unique key for the new + // child frame is composed of the name of the parent frame, the iteration + // number at which the parent frame is creating the new frame, and the + // name of the new frame from nodedef. + gtl::FlatMap outstanding_child_frames_ GUARDED_BY(mu_); + // Lock ordering: ExecutorState.mu_ < mu.
mutex mu; @@ -1230,13 +1237,6 @@ class ExecutorState { mutex mu_; Status status_ GUARDED_BY(mu_); - // Mapping from frame name to outstanding frames. A new frame is created - // at some iteration of an active frame. So the unique key for the new - // child frame is composed of the name of the parent frame, the iteration - // number at which the parent frame is creating the new frame, and the - // name of the new frame from nodedef. - gtl::FlatMap outstanding_frames_ GUARDED_BY(mu_); - // The unique name of a frame. inline string MakeFrameName(FrameState* frame, int64 iter_id, const string& name) { @@ -1244,7 +1244,7 @@ class ExecutorState { } // The unique name of a frame. inline string MakeFrameName(FrameState* frame, const string& name) { - return strings::StrCat(frame->frame_name, ";", name); + return strings::StrCat(frame->frame_id, ";", name); } // Find an existing or create a new child frame in the frame 'frame' at @@ -1347,13 +1347,10 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) root_frame_->iterations[0] = new IterationState( root_frame_->pending_counts, root_frame_->total_input_tensors); - outstanding_frames_.insert({root_frame_->frame_name, root_frame_}); } ExecutorState::~ExecutorState() { - for (auto name_frame : outstanding_frames_) { - delete name_frame.second; - } + for (auto it : device_context_map_) { it->Unref(); } @@ -2332,15 +2329,18 @@ void ExecutorState::DumpState() { mutex_lock l(mu_); if (!dumped_on_error_) { LOG(WARNING) << "Dumping state"; - for (auto& frame : outstanding_frames_) { - LOG(WARNING) << frame.first; - FrameState* frame_state = frame.second; - mutex_lock frame_lock(frame_state->mu); - for (IterationState* iteration : frame_state->iterations) { - LOG(WARNING) << " Iteration:"; - DumpIterationState(frame_state, iteration); - } - } + + // TODO : Make it print all this info recursively! 
+ +// for (auto& frame : outstanding_frames_) { +// LOG(WARNING) << frame.first; +// FrameState* frame_state = frame.second; +// mutex_lock frame_lock(frame_state->mu); +// for (IterationState* iteration : frame_state->iterations) { +// LOG(WARNING) << " Iteration:"; +// DumpIterationState(frame_state, iteration); +// } +// } dumped_on_error_ = true; } } @@ -2383,9 +2383,9 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, else child_name = MakeFrameName(frame, enter_name); { - mutex_lock executor_lock(mu_); - auto it = outstanding_frames_.find(child_name); - if (it != outstanding_frames_.end()) { + mutex_lock frame_lock(frame->mu); + auto it = frame->outstanding_child_frames_.find(child_name); + if (it != frame->outstanding_child_frames_.end()) { *child = it->second; return; } @@ -2410,14 +2410,13 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, new IterationState(temp->pending_counts, temp->total_input_tensors); { - mutex_lock executor_lock(mu_); - auto it = outstanding_frames_.find(child_name); - if (it != outstanding_frames_.end()) { + mutex_lock frame_lock(frame->mu); + auto it = frame->outstanding_child_frames_.find(child_name); + if (it != frame->outstanding_child_frames_.end()) { *child = it->second; } else { - mutex_lock frame_lock(frame->mu); frame->GetIteration(iter)->outstanding_frame_count++; - outstanding_frames_[child_name] = temp; + frame->outstanding_child_frames_[child_name] = temp; *child = temp; temp = nullptr; } @@ -2480,8 +2479,10 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { const string& frame_name = frame->frame_name; if (vlog_) VLOG(2) << "Delete frame " << frame_name; { - mutex_lock executor_lock(mu_); - outstanding_frames_.erase(frame_name); + if (frame->frame_id != 0) { + mutex_lock paranet_frame_lock(parent_frame->mu); + parent_frame->outstanding_child_frames_.erase(frame_name); + } } delete frame; } @@ -2573,19 +2574,17 @@ void 
ExecutorState::FrameState::ActivateNodes(const NodeItem* item, } else { // In case of "Return" dst_node, // we compare node's frame attr with current frame name - // if they are different, propagate as dead - bool wrong_ret = 0; + // if they are different, ignore this op if (dst_item->is_return) { string frameName; GetNodeAttr(dst_item->node->attrs(), "frame_name", &frameName); - frameName = strings::StrCat(parent_frame->frame_name, ";", frameName); - - wrong_ret = (frameName != frame_name); + const string fullName = strings::StrCat(parent_frame->frame_id, ";", frameName); + if (fullName != frame_name) continue; } const bool increment_dead = - (is_dead || (!is_control_edge && !(*outputs)[src_slot].has_value) || wrong_ret); + (is_dead || (!is_control_edge && !(*outputs)[src_slot].has_value)); int pending, dead; iter_state->adjust_for_activation(dst_pending_id, increment_dead, &pending, &dead); From 6ce4f1a0e82fa1797d37e45136a7d374412f5225 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Mon, 16 Jul 2018 10:48:42 +0300 Subject: [PATCH 54/64] Change frame lock mutex and typos --- tensorflow/core/common_runtime/executor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 320998623a..3651bc95d2 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1053,7 +1053,7 @@ class ExecutorState { // child frame is composed of the name of the parent frame, the iteration // number at which the parent frame is creating the new frame, and the // name of the new frame from nodedef. - gtl::FlatMap outstanding_child_frames_ GUARDED_BY(mu_); + gtl::FlatMap outstanding_child_frames_ GUARDED_BY(mu); // Lock ordering: ExecutorState.mu_ < mu. 
mutex mu; @@ -2429,7 +2429,7 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { FrameState* parent_frame = frame->parent_frame; const int64 parent_iter = frame->parent_iter; if (parent_frame != nullptr) { - mutex_lock paranet_frame_lock(parent_frame->mu); + mutex_lock parent_frame_lock(parent_frame->mu); // Propagate all the dead exits to the parent frame. for (const Node* node : frame->dead_exits) { auto parent_iter_state = parent_frame->GetIteration(parent_iter); @@ -2480,7 +2480,7 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { if (vlog_) VLOG(2) << "Delete frame " << frame_name; { if (frame->frame_id != 0) { - mutex_lock paranet_frame_lock(parent_frame->mu); + mutex_lock parent_frame_lock(parent_frame->mu); parent_frame->outstanding_child_frames_.erase(frame_name); } } From 7223ca7834903f0fc03876c4ddca0009bbaed85c Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Mon, 16 Jul 2018 10:52:49 +0300 Subject: [PATCH 55/64] Minor change the creation of a frame --- tensorflow/core/common_runtime/executor.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 3651bc95d2..68aee246c0 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2372,16 +2372,16 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, DCHECK(s.ok()) << s; string child_name; - int parallel_iters = 1; + int parallel_iters; if (!IsCall(node)) { s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); DCHECK(s.ok()) << s; - child_name = MakeFrameName(frame, iter, enter_name); + } else { + parallel_iters = 1; // since this is not a loop scope there are no iterations + child_name = MakeFrameName(frame, enter_name); } - else child_name = MakeFrameName(frame, enter_name); - { mutex_lock frame_lock(frame->mu); auto it = 
frame->outstanding_child_frames_.find(child_name); From 0bf53cde5b9e4aa39085f3f09b672d8a91a4e670 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Mon, 16 Jul 2018 11:23:46 +0300 Subject: [PATCH 56/64] Refactor synonym_frames Makes it inclusive to ControlFlowInfo. Results into unchanged signature for BuildControlFlowInfo. --- tensorflow/core/common_runtime/executor.cc | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 68aee246c0..4e3636913a 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -368,6 +368,7 @@ class ExecutorImpl : public Executor { struct ControlFlowInfo { gtl::FlatSet unique_frame_names; std::vector frame_names; + std::unordered_map>& synonym_frame_names; }; struct FrameInfo { @@ -401,8 +402,7 @@ class ExecutorImpl : public Executor { }; static Status BuildControlFlowInfo(const Graph* graph, - ControlFlowInfo* cf_info, - std::unordered_map>& synonym_frames); + ControlFlowInfo* cf_info); void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info); FrameInfo* EnsureFrameInfo(const string& fname) { @@ -608,12 +608,11 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending, } Status ExecutorImpl::Initialize() { - std::unordered_map> synonym_frames; gview_.Initialize(graph_); // Build the information about frames in this subgraph. ControlFlowInfo cf_info; - TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_, &cf_info, synonym_frames)); + TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_, &cf_info)); // Cache this value so we make this virtual function call once, rather // that O(# steps * # nodes per step) times. @@ -687,7 +686,7 @@ Status ExecutorImpl::Initialize() { InitializePending(graph_, cf_info); // Copy Synonym FrameInfos ------ is that necessary? 
- for (const auto& frame : synonym_frames) { + for (const auto& frame : cf_info.synonym_frame_names) { FrameInfo* copyFrom = EnsureFrameInfo(frame.first); for (const auto& syn : frame.second) { FrameInfo* frame_info = EnsureFrameInfo(syn); @@ -701,7 +700,6 @@ Status ExecutorImpl::Initialize() { frame_info->nodes->push_back(n); } } - } return gview_.SetAllocAttrs(graph_, params_.device); } @@ -1358,9 +1356,7 @@ ExecutorState::~ExecutorState() { } Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, - ControlFlowInfo* cf_info, - std::unordered_map>& synonym_frames) { - std::unordered_map synframeToCall; + ControlFlowInfo* cf_info) { const int num_nodes = g->num_node_ids(); cf_info->frame_names.resize(num_nodes); std::vector parent_nodes; @@ -1380,6 +1376,8 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, } } + std::unordered_map synframeToCall; + while (!ready.empty()) { Node* curr_node = ready.front(); int curr_id = curr_node->id(); @@ -1410,15 +1408,15 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, // Enter a child frame. 
parent = curr_node; // If not already in map, add it as a new key - if (synonym_frames.find(frame_name) == synonym_frames.end()) { + if (cf_info->synonym_frame_names.find(frame_name) == cf_info->synonym_frame_names.end()) { std::set synonyms; synonyms.clear(); - synonym_frames.emplace(frame_name, synonyms); // std::move() + cf_info->synonym_frame_names.emplace(frame_name, synonyms); // std::move() } } else { // Recursive call : either from within the same function or from another one // It's just a synonym frame - if (synonym_frames[cf_info->frame_names[out_id]].emplace(frame_name).second == true) { + if (cf_info->synonym_frame_names[cf_info->frame_names[out_id]].emplace(frame_name).second == true) { synframeToCall.emplace(frame_name, curr_id); } } @@ -1426,7 +1424,7 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, TF_RETURN_IF_ERROR( GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); // node corresponds to a recursive call - if (synonym_frames.find(frame_name) == synonym_frames.end()) { + if (cf_info->synonym_frame_names.find(frame_name) == cf_info->synonym_frame_names.end()) { std::unordered_map::const_iterator it = synframeToCall.find(frame_name); if (it != synframeToCall.end()) { // we don't trust parent_nodes[curr_id] and cf_info->frame_names[curr_id] From d53d6d9851de7871e13c6c4f9cf75daf098c16fc Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Mon, 16 Jul 2018 12:47:39 +0300 Subject: [PATCH 57/64] Remove var ref in synonym_frame_names --- tensorflow/core/common_runtime/executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 4e3636913a..24bcabedfd 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -368,7 +368,8 @@ class ExecutorImpl : public Executor { struct ControlFlowInfo { gtl::FlatSet unique_frame_names; std::vector frame_names; - std::unordered_map>& 
synonym_frame_names; + std::unordered_map> synonym_frame_names; + //std::unordered_multimap synonym_frame_names; }; struct FrameInfo { From 3ca7c5b4d4dc0e6172dbd6fe0956dd0c72437713 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Mon, 16 Jul 2018 12:59:01 +0300 Subject: [PATCH 58/64] Untangle child_name-ing and parallel iters --- tensorflow/core/common_runtime/executor.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 24bcabedfd..6cead3d832 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2369,17 +2369,9 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, string enter_name; Status s = GetNodeAttr(node->attrs(), "frame_name", &enter_name); DCHECK(s.ok()) << s; - string child_name; - - int parallel_iters; - if (!IsCall(node)) { - s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); - DCHECK(s.ok()) << s; - child_name = MakeFrameName(frame, iter, enter_name); - } else { - parallel_iters = 1; // since this is not a loop scope there are no iterations - child_name = MakeFrameName(frame, enter_name); - } + const string child_name = IsCall(node) ? + MakeFrameName(frame, enter_name) : + MakeFrameName(frame, iter, enter_name); { mutex_lock frame_lock(frame->mu); @@ -2394,6 +2386,14 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, // Note that this new frame instance is created without any locks. 
if (vlog_) VLOG(2) << "Create frame: " << child_name; + int parallel_iters; + s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); + DCHECK(s.ok()) << s; + + if (IsCall(node)) { + // since this is not a loop scope there are no iterations + parallel_iters = 1; + } FrameState* temp = new FrameState(impl_, parallel_iters); temp->frame_name = child_name; From d034dc2fa5a2bc62d022b5d98648a331512a1006 Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Mon, 16 Jul 2018 21:52:11 +0300 Subject: [PATCH 59/64] Guard GetNodeAttr to access it when it's necessary Apparently GetNodeAttr costs an arm and a leg. Adds 2.0-2.2 sec in fib(30). --- tensorflow/core/common_runtime/executor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 6cead3d832..832bbb3bbe 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2387,12 +2387,12 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, if (vlog_) VLOG(2) << "Create frame: " << child_name; int parallel_iters; - s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); - DCHECK(s.ok()) << s; - if (IsCall(node)) { // since this is not a loop scope there are no iterations parallel_iters = 1; + } else { + s = GetNodeAttr(node->attrs(), "parallel_iterations", ¶llel_iters); + DCHECK(s.ok()) << s; } FrameState* temp = new FrameState(impl_, parallel_iters); From 3e3eac062ef454d6eeb200134b00f5caadec904f Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Tue, 17 Jul 2018 12:42:07 +0300 Subject: [PATCH 60/64] Use parent_frame not null check --- tensorflow/core/common_runtime/executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 832bbb3bbe..668ef045dd 100644 --- 
a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2478,7 +2478,7 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { const string& frame_name = frame->frame_name; if (vlog_) VLOG(2) << "Delete frame " << frame_name; { - if (frame->frame_id != 0) { + if (parent_frame != nullptr) { mutex_lock parent_frame_lock(parent_frame->mu); parent_frame->outstanding_child_frames_.erase(frame_name); } From 8fb3e55f8fc9cea0db1fbe8db8b1a264f5008cdb Mon Sep 17 00:00:00 2001 From: Angelos Charalambidis Date: Fri, 20 Jul 2018 19:38:09 +0300 Subject: [PATCH 61/64] Local executor optimizations (#8) * Use VLOG instead of printf * Fix typo * Comment out eventwriters for performance * Print diagnostics in VLOG instead of stdout * Change CopyArgTypeN to CopyArgType; rely on function overloading * Use Merge with N>=2 arguments * Oops, missing semicolon * Remove commented code * Change the MakeFrameName to produce more compact names * Fix compilation error in executor * Fix compilation error in executor * Change outstanding_frames to use frame_id instead of frame_name * Revert "Change outstanding_frames to use frame_id instead of frame_name" This reverts commit 707e41d975b38d92f60d1de6aa8d53c67f394177. We revert since there we didn't get any performance gain anyway. We may come back to that during fine-tuning. * Cache frame_name in NodeItem * Cache frame_name in NodeItem * Cache frame_name in NodeItem * Cache frame_name in NodeItem * Cache frame_name in NodeItem * Change the implementation of BuildControlFlowInfo Remove synonym frames and use two kind of frames for calls: static and dynamic where static corresponds to the static information which is per function and dynamic which is per function call. 
Squashed commit of the following: commit d32d488cd53ec291dbb4a795e636c0d599effd34 Author: Angelos Charalambidis Date: Wed Jul 18 14:23:32 2018 +0300 Use call_id as check of return commit 390d2e7491f0ce1efa1ea0e19bfd7e454a10a001 Author: Angelos Charalambidis Date: Wed Jul 18 14:22:22 2018 +0300 Use call_id as check of return commit 364db3aa66e91a09c1e124425d41ea74599877a3 Author: Angelos Charalambidis Date: Wed Jul 18 13:58:01 2018 +0300 Change outstanding_frames to use frame_id instead of frame_name commit 7095ea29913a6f29b3c94a825bdbb7214198d1d4 Author: Angelos Charalambidis Date: Wed Jul 18 13:49:30 2018 +0300 Syntax error fixed commit cf0c48b109e8896ccb434cad3e78a84eb3909f0c Author: Angelos Charalambidis Date: Wed Jul 18 13:05:56 2018 +0300 Fix input_count to count corrently the inputs of calls commit 78feb11fb71344cea053ebc9b2345ec1034c7078 Author: Angelos Charalambidis Date: Tue Jul 17 18:38:40 2018 +0300 Fix error commit 9aeec10eec7916104b17c8e3ee179ebcb08fb8ff Author: Angelos Charalambidis Date: Tue Jul 17 18:30:10 2018 +0300 Fix syntax error commit b2861747315302baba2be7c71722ae8e3f5e758a Merge: e92ef53bd2 c7e335cbd2 Author: Angelos Charalambidis Date: Tue Jul 17 18:02:36 2018 +0300 Merge branch 'r1.4_recursion_synonyms' into r1.4_recursion_opt_merge commit c7e335cbd28e49bf90143f0b4872bdea432c5310 Author: Angelos Charalambidis Date: Tue Jul 17 11:59:41 2018 +0300 Simplify BuildControlFlowInfo commit 8423b3a3d43d430f34c5ccd8d0e9b4aa59049e58 Author: Angelos Charalambidis Date: Tue Jul 17 11:55:28 2018 +0300 Simplify BuildControlFlowInfo commit 8d80595397892bb1cb0dd6ac26209155ee50aac0 Author: Angelos Charalambidis Date: Tue Jul 17 11:43:01 2018 +0300 Refactor BuildControlFlowInfo commit c252b6cb6ee7c48e46ab722c64667f2812e2522e Author: Angelos Charalambidis Date: Tue Jul 17 11:41:08 2018 +0300 Refactor BuildControlFlowInfo commit c53327e6fbe1b97f3044f412a49cca992c37c15b Author: Angelos Charalambidis Date: Mon Jul 16 23:46:07 2018 +0300 Fix incompatible Return 
name commit 564b1abf1cf8d711608f98f0630368a41b21c0ed Author: Angelos Charalambidis Date: Mon Jul 16 21:55:55 2018 +0300 Leave only call_id as dyn frame name commit 8749ec86b39e23e3e9d42230fe841015daf328bf Merge: 0fcd2b5dd6 d034dc2fa5 Author: Angelos Charalambidis Date: Mon Jul 16 21:53:51 2018 +0300 Merge branch 'r1.4_recursion' into r1.4_recursion_synonyms commit 0fcd2b5dd6b62cc1f8ac8285938f0646fcf10170 Author: Angelos Charalambidis Date: Mon Jul 16 18:57:50 2018 +0300 Fix frame creation commit f8fca8aa3ee06297af42ea92b8e84b8722ba15c1 Author: Angelos Charalambidis Date: Mon Jul 16 16:23:52 2018 +0300 Fix syntax error commit 26b3393ef7c39dba822dddce60bb4def61a0a501 Author: Angelos Charalambidis Date: Mon Jul 16 16:20:47 2018 +0300 Fix syntax error commit f47322c59e1916fd69139a1ddd36cc0cd45834cf Author: Angelos Charalambidis Date: Mon Jul 16 16:11:07 2018 +0300 Fix syntax error commit c278522ce39e509df326a80824de485aa5b81fe0 Author: Angelos Charalambidis Date: Mon Jul 16 16:02:09 2018 +0300 Fix some typo commit aca021896c1d58585298a63e613ec50183a0a5cb Author: Angelos Charalambidis Date: Mon Jul 16 15:40:34 2018 +0300 Keep separate info for function name and call id. Record at function transformation the function name, call id and arg id. This deprecates the need of computing synonym frames in every execution run. function name is named "frame_name" and is used to index the static frame info, i.e. all calls of the function share the same static frame info. The runtime framestate is named after the concatenation of the parent_id, function_name and call_id to ensure uniqueness. Hope it works.. 
* Exit has no frame_name (apparently) * Quick Fix on Topological Sort --- tensorflow/core/common_runtime/executor.cc | 166 ++++++++---------- .../common_runtime/graph_execution_state.cc | 6 +- .../optimizers/function_transformation.cc | 80 ++++----- .../core/grappler/utils/topological_sort.cc | 20 ++- tensorflow/core/ops/function_control_ops.cc | 8 + 5 files changed, 135 insertions(+), 145 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 668ef045dd..00f69a0a41 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -237,6 +237,11 @@ struct NodeItem { // Number of output edges. size_t num_output_edges; + string frame_name; // cache the attribute if is_enter | is-exit | is_call | is_return + string dyn_frame_name; // cache the attribute if is_enter | is-exit | is_call | is_return + + int call_id = -1; + PendingCounts::Handle pending_id; const EdgeInfo* output_edge_list() const { return output_edge_base(); } @@ -368,8 +373,6 @@ class ExecutorImpl : public Executor { struct ControlFlowInfo { gtl::FlatSet unique_frame_names; std::vector frame_names; - std::unordered_map> synonym_frame_names; - //std::unordered_multimap synonym_frame_names; }; struct FrameInfo { @@ -624,6 +627,8 @@ Status ExecutorImpl::Initialize() { EnsureFrameInfo(it)->nodes = new std::vector; } + std::unordered_map input_count; + // Preprocess every node in the graph to create an instance of op // kernel for each node. for (const Node* n : graph_->nodes()) { @@ -675,10 +680,23 @@ Status ExecutorImpl::Initialize() { // Initialize static information about the frames in the graph. 
frame_info->nodes->push_back(n); - if (IsEnter(n) || IsCall(n)) { - string enter_name; - TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &enter_name)); - EnsureFrameInfo(enter_name)->input_count++; + if (IsEnter(n)) { + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &item->frame_name)); + item->dyn_frame_name = item->frame_name; + } + if (item->is_call_or_return) { + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &item->frame_name)); + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "call_id", &item->call_id)); + item->dyn_frame_name = strings::StrCat(item->call_id); + } + if (item->is_enter) { + EnsureFrameInfo(item->frame_name)->input_count++; + } + if (item->is_call) { + input_count[item->dyn_frame_name]++; + // The following assumes that all the calls of same function have the same number of inputs + // which is of course apparent for a well-formed graph (produced by the transformation) + EnsureFrameInfo(item->frame_name)->input_count = input_count[item->dyn_frame_name]; } } @@ -686,22 +704,6 @@ Status ExecutorImpl::Initialize() { // all nodes. InitializePending(graph_, cf_info); - // Copy Synonym FrameInfos ------ is that necessary? - for (const auto& frame : cf_info.synonym_frame_names) { - FrameInfo* copyFrom = EnsureFrameInfo(frame.first); - for (const auto& syn : frame.second) { - FrameInfo* frame_info = EnsureFrameInfo(syn); - // Copy FrameInfo - frame_info->total_inputs = copyFrom->total_inputs; - frame_info->input_count = copyFrom->input_count; - frame_info->pending_counts_layout = copyFrom->pending_counts_layout; - frame_info->pending_counts = new PendingCounts(*copyFrom->pending_counts); - frame_info->nodes = new std::vector; - for (const Node* n : *copyFrom->nodes) { - frame_info->nodes->push_back(n); - } - } - } return gview_.SetAllocAttrs(graph_, params_.device); } @@ -1002,6 +1004,9 @@ class ExecutorState { // frame_name. 
uint64 frame_id; + + int call_id = -1; + // The iteration id of its parent frame when this frame is created. // -1 if there is no parent frame. The frame_name/parent_iter pair // uniquely identifies this FrameState. @@ -1052,7 +1057,7 @@ class ExecutorState { // child frame is composed of the name of the parent frame, the iteration // number at which the parent frame is creating the new frame, and the // name of the new frame from nodedef. - gtl::FlatMap outstanding_child_frames_ GUARDED_BY(mu); + gtl::FlatMap outstanding_child_frames_ GUARDED_BY(mu); // Lock ordering: ExecutorState.mu_ < mu. mutex mu; @@ -1239,7 +1244,8 @@ class ExecutorState { // The unique name of a frame. inline string MakeFrameName(FrameState* frame, int64 iter_id, const string& name) { - return strings::StrCat(frame->frame_name, ";", iter_id, ";", name); + //return strings::StrCat(frame->frame_name, frame->frame_id, ";", iter_id, ";", name); + return strings::StrCat(frame->frame_id, ";", iter_id, ";", name); } // The unique name of a frame. inline string MakeFrameName(FrameState* frame, const string& name) { @@ -1377,7 +1383,7 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, } } - std::unordered_map synframeToCall; + std::unordered_map call_id_to_call_node_id; while (!ready.empty()) { Node* curr_node = ready.front(); @@ -1398,53 +1404,35 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, } else if (IsCall(curr_node)) { TF_RETURN_IF_ERROR( GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); - int out_id; - // Remove for loop and grab the only actual output of the call node - for (const Edge* out_edge : curr_node->out_edges()) { - out_id = out_edge->dst()->id(); - break; - } - // Not a recursive call - if (!visited[out_id]) { - // Enter a child frame. 
- parent = curr_node; - // If not already in map, add it as a new key - if (cf_info->synonym_frame_names.find(frame_name) == cf_info->synonym_frame_names.end()) { - std::set synonyms; - synonyms.clear(); - cf_info->synonym_frame_names.emplace(frame_name, synonyms); // std::move() - } - } else { - // Recursive call : either from within the same function or from another one - // It's just a synonym frame - if (cf_info->synonym_frame_names[cf_info->frame_names[out_id]].emplace(frame_name).second == true) { - synframeToCall.emplace(frame_name, curr_id); - } - } + + int call_id; + + TF_RETURN_IF_ERROR( + GetNodeAttr(curr_node->attrs(), "call_id", &call_id)); + // we assume that call_id is unique and we don't need to concat with frame_name + // to make it unique. + + call_id_to_call_node_id.emplace(call_id, curr_id); + + parent = curr_node; + } else if (IsReturn(curr_node)) { + + int call_id; + TF_RETURN_IF_ERROR( - GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name)); - // node corresponds to a recursive call - if (cf_info->synonym_frame_names.find(frame_name) == cf_info->synonym_frame_names.end()) { - std::unordered_map::const_iterator it = synframeToCall.find(frame_name); - if (it != synframeToCall.end()) { - // we don't trust parent_nodes[curr_id] and cf_info->frame_names[curr_id] - // values that were set by the predecessor as they might be wrong in - // case of mutually recursive functions - int call_id = it->second; - parent = parent_nodes[call_id]; - frame_name = cf_info->frame_names[call_id]; - } else { - // node corresponds to a recursive call we have not already encountered - // Insert back in queue so it will be processed again after synonym frame is created - ready.push_back(curr_node); - continue; - } + GetNodeAttr(curr_node->attrs(), "call_id", &call_id)); + + auto it = call_id_to_call_node_id.find(call_id); + + if (it != call_id_to_call_node_id.end()) { + int call_node_id = it->second; + parent = parent_nodes[call_node_id]; + frame_name = 
cf_info->frame_names[call_node_id]; } else { - // Exit to the parent frame. - parent = parent_nodes[curr_id]; - frame_name = cf_info->frame_names[parent->id()]; - parent = parent_nodes[parent->id()]; + // is this even possible (encounter a Return before a Call) ?? + ready.push_back(curr_node); + continue; } } else { parent = parent_nodes[curr_id]; @@ -1997,8 +1985,10 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, FrameState* output_frame = input_frame; int64 output_iter = input_iter; -//printf("Propagate Outputs: %s\n", node->name().c_str()); -//printf("Frame: %s\n", input_frame->frame_name.c_str()); + if (vlog_) { + VLOG(2) << "Propagate Outputs: " << node->name(); + VLOG(2) << "Frame: " << input_frame->frame_name; + } if (!item->is_enter_exit_or_next_iter && !item->is_call_or_return) { // Fast path for nodes types that don't need special handling @@ -2365,17 +2355,19 @@ void ExecutorState::Finish() { void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, const Node* node, FrameState** child) { - // Get the child frame name. - string enter_name; - Status s = GetNodeAttr(node->attrs(), "frame_name", &enter_name); - DCHECK(s.ok()) << s; - const string child_name = IsCall(node) ? - MakeFrameName(frame, enter_name) : - MakeFrameName(frame, iter, enter_name); + const GraphView& gview = impl_->gview_; + const NodeItem* item = gview.node(node->id()); + Status s; + const string& enter_name = item->frame_name; + const string& dyn_frame_name = item->dyn_frame_name; + const string child_name = item->is_call ? 
+ MakeFrameName(frame, dyn_frame_name) : + MakeFrameName(frame, iter, dyn_frame_name); + const uint64 child_id = Hash64(child_name); { mutex_lock frame_lock(frame->mu); - auto it = frame->outstanding_child_frames_.find(child_name); + auto it = frame->outstanding_child_frames_.find(child_id); if (it != frame->outstanding_child_frames_.end()) { *child = it->second; return; @@ -2397,9 +2389,10 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, FrameState* temp = new FrameState(impl_, parallel_iters); temp->frame_name = child_name; - temp->frame_id = Hash64(child_name); + temp->frame_id = child_id; temp->parent_frame = frame; temp->parent_iter = iter; + temp->call_id = item->call_id; temp->InitializeFrameInfo(enter_name); // 'iterations' is a fixed-length circular buffer. @@ -2410,12 +2403,12 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, { mutex_lock frame_lock(frame->mu); - auto it = frame->outstanding_child_frames_.find(child_name); + auto it = frame->outstanding_child_frames_.find(child_id); if (it != frame->outstanding_child_frames_.end()) { *child = it->second; } else { frame->GetIteration(iter)->outstanding_frame_count++; - frame->outstanding_child_frames_[child_name] = temp; + frame->outstanding_child_frames_[child_id] = temp; *child = temp; temp = nullptr; } @@ -2480,7 +2473,7 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { { if (parent_frame != nullptr) { mutex_lock parent_frame_lock(parent_frame->mu); - parent_frame->outstanding_child_frames_.erase(frame_name); + parent_frame->outstanding_child_frames_.erase(frame->frame_id); } } delete frame; @@ -2575,11 +2568,8 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, // we compare node's frame attr with current frame name // if they are different, ignore this op if (dst_item->is_return) { - - string frameName; - GetNodeAttr(dst_item->node->attrs(), "frame_name", &frameName); - const string fullName = 
strings::StrCat(parent_frame->frame_id, ";", frameName); - if (fullName != frame_name) continue; + if (dst_item->call_id != call_id) + continue; } const bool increment_dead = diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index be46a9885b..7ace622237 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -363,7 +363,7 @@ Status GraphExecutionState::OptimizeGraph( optimized_graph->reset(new Graph(OpRegistry::Global())); TF_RETURN_IF_ERROR( ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); -/*******************************************************************************************/ +/******************************************************************************************* // Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("Fully_Optimized"); Event event; @@ -380,8 +380,10 @@ Status GraphExecutionState::OptimizeGraph( const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); -// printf("Transformation passed successfully!\n"); /*******************************************************************************************/ + + VLOG(1) << "Transformation passed successfully"; + // The graph conversion sets the requested device names but not the assigned // device names. However, since at this point the graph is placed TF expects // an assigned device name for every node. 
Therefore we copy the requested diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index c7173dd657..72b47cfd84 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -115,7 +115,7 @@ Status CopyArgType(const NodeDef& func_node, // Copy input/output argument type to the type_list. Return error if argument // type is not explicitly defined, and not specified in function attributes. -Status CopyArgTypeN(const NodeDef& func_node, +Status CopyArgType(const NodeDef& func_node, const std::unordered_map& func_attr, const string& arg_kind, const OpDef::ArgDef& arg, AttrValue::ListValue* type_list) { @@ -167,7 +167,7 @@ Status GatherOutputs(const GrapplerItem& item, const FunctionInliningContext& ct Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, - std::unordered_map &functions_in , int& frame_name) { + std::unordered_map &functions_in, int call_id) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); DataType type; @@ -184,7 +184,9 @@ Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimi call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); - (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(frame_name)); + (*call->mutable_attr())["frame_name"].set_s(func_node.op()); + (*call->mutable_attr())["call_id"].set_i(call_id); + (*call->mutable_attr())["arg_id"].set_i(i); (*call->mutable_attr())["is_constant"].set_b(false); NodeDef* merge = argmerge_map[arg.name()]; @@ -202,8 +204,9 @@ Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimi ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); 
TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); (*ret->mutable_attr())["T"].set_type(type); - (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(frame_name)); - + (*ret->mutable_attr())["frame_name"].set_s(func_node.op()); + (*ret->mutable_attr())["call_id"].set_i(call_id); + (*ret->mutable_attr())["arg_id"].set_i(i); } return Status::OK(); } @@ -247,7 +250,9 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); - (*call->mutable_attr())["frame_name"].set_s(strings::StrCat(frame_name)); + (*call->mutable_attr())["frame_name"].set_s(func_node.op()); + (*call->mutable_attr())["call_id"].set_i(frame_name); + (*call->mutable_attr())["arg_id"].set_i(i); (*call->mutable_attr())["is_constant"].set_b(false); // Create and add a temporary merge node (IdentityN) for every input arg @@ -337,52 +342,31 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, ret->add_input(strings::StrCat(func_node.name(), "/", input)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); (*ret->mutable_attr())["T"].set_type(type); - (*ret->mutable_attr())["frame_name"].set_s(strings::StrCat(cpframe_name)); + (*ret->mutable_attr())["frame_name"].set_s(func_node.op()); + (*ret->mutable_attr())["call_id"].set_i(cpframe_name); + (*ret->mutable_attr())["arg_id"].set_i(i); } // Break IdentityN Merges into multiple common Binary Merge ops int j=0; for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { - DataType type; - NodeDef *new_merge, *merge = it->second; - int i, size = merge->input_size(); - TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", func.signature().input_arg(j), &type)); - - // If there is only one call site - if (size < 2) { - merge->set_op("Identity"); - merge->set_device(func_node.device()); - 
(*merge->mutable_attr())["T"].set_type(type); - } else { - string name = merge->name(); - string in1 = merge->input(0), in2; - - for (i = 1; i < size-1; i++) { - in2 = merge->input(i); - new_merge = optimized_graph->add_node(); - - name = strings::StrCat(name, size - i - 1); - new_merge->set_name(name); - new_merge->set_op("Merge"); - new_merge->set_device(func_node.device()); - new_merge->add_input(in1); - new_merge->add_input(in2); - (*new_merge->mutable_attr())["T"].set_type(type); - (*new_merge->mutable_attr())["N"].set_i(2); - - in1 = name; - } + DataType type; + NodeDef *new_merge, *merge = it->second; + int i, size = merge->input_size(); - // Modify initial Merge - in2 = merge->input(i); - merge->set_op("Merge"); - merge->set_device(func_node.device()); - merge->clear_input(); - merge->add_input(in1); - merge->add_input(in2); - (*merge->mutable_attr())["T"].set_type(type); - (*merge->mutable_attr())["N"].set_i(2); - } + TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, + "input", func.signature().input_arg(j), &type)); + + if (size <= 1) { + merge->set_op("Identity"); + merge->set_device(func_node.device()); + (*merge->mutable_attr())["T"].set_type(type); + } else { + merge->set_op("Merge"); + merge->set_device(func_node.device()); + (*merge->mutable_attr())["T"].set_type(type); + (*merge->mutable_attr())["N"].set_i(size); + } } return Status::OK(); @@ -447,7 +431,7 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it AttrValue::ListValue* type_list = (*idN->mutable_attr())["T"].mutable_list(); for (const OpDef::ArgDef& arg : func->signature().output_arg()) { - TF_RETURN_IF_ERROR(CopyArgTypeN(node, func_attr, "input", arg, type_list)); + TF_RETURN_IF_ERROR(CopyArgType(node, func_attr, "input", arg, type_list)); } idN->add_input(strings::StrCat(node.name(), "/Ret", id.second)); @@ -466,7 +450,7 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it *optimized_graph->mutable_versions() = 
item.graph.versions(); *optimized_graph->mutable_library() = item.graph.library(); - /******************************************************************************************************/ + /****************************************************************************************************** // Dumps optimized graph in a not so readable form // const GraphDef* tmp = optimized_graph; // printf("Summarize Optimized Graph\n %s\n", SummarizeGraphDef(*tmp).c_str()); diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc index 5812c14c90..86c56a2c6c 100644 --- a/tensorflow/core/grappler/utils/topological_sort.cc +++ b/tensorflow/core/grappler/utils/topological_sort.cc @@ -33,13 +33,15 @@ void TopologicalSort(GraphDef* graph) { int front = 0; int back = 0; std::unordered_map ready_inputs; - std::unordered_map> returning_nodes; + std::unordered_map> returning_nodes; for (int i = 0; i < graph->node_size(); i++) { auto node = graph->mutable_node(i); if (node->input_size() == 0) { ready_nodes.push_back(node); back++; } + bool recursion_merge = 0; + if (IsMerge(*node)) { ready_inputs[node] = 0; for (const auto& input : node->input()) { @@ -47,11 +49,15 @@ void TopologicalSort(GraphDef* graph) { ready_inputs[node]++; } else if (IsCall(*output_map.GetNode(input))) { - // We don't want to increase merge's ready_inputs - // every time we meet a Call input. Just Once. 
- ready_inputs[node] = 1; + ready_inputs[node] ++; + recursion_merge = 1; } } + if (recursion_merge) { + ready_inputs[node]--; + recursion_merge = 0; + } + } else if (IsReturn(*node)) { // Nodes that send their output to "Return" nodes are // function Returning Nodes and in case of recursive functions @@ -64,9 +70,9 @@ void TopologicalSort(GraphDef* graph) { // with different "frame_name" attributes (same "frame_name" // attrs would mean that they belong in the same function call // but they correspond to different function outputs) - string frame_name; - GetNodeAttr(AttrSlice(*node), "frame_name", &frame_name); - returning_nodes[prevNode].emplace(frame_name); + int call_id; + GetNodeAttr(AttrSlice(*node), "call_id", &call_id); + returning_nodes[prevNode].emplace(call_id); } ready_inputs[node] = 0; diff --git a/tensorflow/core/ops/function_control_ops.cc b/tensorflow/core/ops/function_control_ops.cc index 829b7e5cd9..fbb74aad89 100644 --- a/tensorflow/core/ops/function_control_ops.cc +++ b/tensorflow/core/ops/function_control_ops.cc @@ -25,6 +25,8 @@ REGISTER_OP("Call") .Output("output: T") .Attr("T: type") .Attr("frame_name: string") + .Attr("call_id: int") + .Attr("arg_id: int") .Attr("is_constant: bool = false") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->UnknownShape()); @@ -62,6 +64,8 @@ REGISTER_OP("RefCall") .Output("output: Ref(T)") .Attr("T: type") .Attr("frame_name: string") + .Attr("call_id: int") + .Attr("arg_id: int") .Attr("is_constant: bool = false") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( @@ -84,6 +88,8 @@ REGISTER_OP("Return") .Output("output: T") .Attr("T: type") .Attr("frame_name: string") +.Attr("call_id: int") +.Attr("arg_id: int") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( Exits the current frame to its parent frame. 
@@ -97,6 +103,8 @@ REGISTER_OP("RefReturn") .Output("output: Ref(T)") .Attr("T: type") .Attr("frame_name: string") +.Attr("call_id: int") +.Attr("arg_id: int") .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"Doc( Exits the current frame to its parent frame. From 4c192b455b2e9fc4596816b82b1d2b25ca76a378 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Wed, 5 Sep 2018 15:01:21 +0300 Subject: [PATCH 62/64] bugs fixed while working on distr --- tensorflow/core/graph/graph_constructor.cc | 88 +++++++++++++++---- .../optimizers/function_transformation.cc | 2 +- .../core/grappler/utils/topological_sort.cc | 16 ++-- 3 files changed, 79 insertions(+), 27 deletions(-) diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc index b58f0a0d14..cb0f230311 100644 --- a/tensorflow/core/graph/graph_constructor.cc +++ b/tensorflow/core/graph/graph_constructor.cc @@ -147,7 +147,10 @@ class GraphConstructor { original_versions_(g->versions()), refiner_(refiner), return_tensors_(return_tensors), - unused_input_map_keys_(unused_input_map_keys) {} + unused_input_map_keys_(unused_input_map_keys) { + + SetFunctionReturningNodes(node_defs); + } Status TryImport() { TF_RETURN_IF_ERROR(EnsureNoNameCollisions()); @@ -193,7 +196,52 @@ class GraphConstructor { void AddPrefixToNodeDef(const std::vector& input_already_exists, NodeDef* node_def); - // From constructor + bool IsReturningNode(const NodeDef& node_def) { + return (function_returning_nodes_.find(node_def.name()) != + function_returning_nodes_.end()); + } + + void SetFunctionReturningNodes(const NodeDefSlice& node_defs) { + + std::unordered_map> returning_nodes; + + for (int n = 0; n < node_defs.size(); ++n) { + const NodeDef& node_def = *node_defs[n]; + if (IsReturn(node_def)) { + // Nodes that send their output to "Return" nodes are + // function Returning Nodes and in case of recursive functions + // those nodes are part of graph cycles. 
+ for (const auto& input : node_def.input()) { + // In order to detect the recursion cycles we depend on + // the fact that a recursive function's returning node, + // will be sending outputs to at least 2 "Return" nodes + // with different "call_id" attributes (same "call_id" + // attrs would mean that they belong in the same function call + // but they correspond to different function outputs) + if (!StringPiece(input).starts_with("^")) { + int call_id; + GetNodeAttr(AttrSlice(node_def), "call_id", &call_id); + + size_t pos; + string prevNode; + ((pos = input.find(":")) != std::string::npos) ? + (prevNode = input.substr(0, pos)) : (prevNode = input); + + returning_nodes[prevNode].emplace(call_id); + } + } + } + } + for (auto& retnode : returning_nodes) { + if (retnode.second.size() > 1) { + // Detected Cycle + function_returning_nodes_.insert(retnode.first); + } + } + } + + + // From constructor const Options opts_; const NodeDefSlice node_defs_; const VersionDef* versions_; @@ -261,6 +309,8 @@ class GraphConstructor { int dst_index; }; std::vector back_edges_; + + std::unordered_set function_returning_nodes_; }; // This could be expensive but we don't expect to call it often, if at all (only @@ -434,12 +484,23 @@ Status GraphConstructor::InitFromEdges() { // Parse the inputs for each node. for (int n = 0; n < num_nodes; ++n) { const NodeDef& node_def = *node_defs_[n]; - if (IsMerge(node_def)) { + + if (IsReturningNode(node_def)) { + int32 num_control_edges = 0; + for (int i = 0; i < node_def.input_size(); ++i) { + if (StringPiece(node_def.input(i)).starts_with("^")) { + num_control_edges++; + } + } + pending_count_.push_back(num_control_edges + 1); + + } else if (IsMerge(node_def)) { // Cycles in the graph are only allowed for while loops and recursion. // A while loop is identified by an edge from a NextIteration node to a Merge node. 
- // A recursion is identified by an edge from a NextCall Node to a Merge node - // For such Merge nodes, only wait for one non-control input before - // considering the node ready to process in Convert(). + // A recursion is identified by an edge from a Call Node to a Merge node + // In recursion, function returning nodes also participate in a cycle + // For such Merge nodes, and for function returning nodes only wait for + // one non-control input before considering the node ready to process in Convert(). int32 num_control_edges = 0; bool has_loop_back_edge = false; for (int i = 0; i < node_def.input_size(); ++i) { @@ -461,17 +522,6 @@ Status GraphConstructor::InitFromEdges() { } else { pending_count_.push_back(node_def.input_size()); } - } else if (IsReturn(node_def)) { - // Does not necessarily mean cycle though - maybe I should find a better condition - int32 num_control_edges = 0; - for (int i = 0; i < node_def.input_size(); ++i) { - StringPiece input_name(node_def.input(i)); - if (input_name.starts_with("^")) { - num_control_edges++; - } - } - pending_count_.push_back(num_control_edges); - ready_.push_back(n); } else { pending_count_.push_back(node_def.input_size()); } @@ -847,10 +897,10 @@ Status GraphConstructor::Convert() { inputs.push_back(InputInfo(id.first.ToString(), src_node, src_index)); } - if (has_data_back_edge && !IsMerge(*node_def) && !IsReturn(*node_def)) { + if (has_data_back_edge && !IsMerge(*node_def) && !IsReturningNode(*node_def)) { return errors::InvalidArgument( "Node '", node_def->name(), - "' had a back edge, but only Merge and Return nodes can have back edges."); + "' had a back edge, but only Merge and returning nodes can have back edges."); } Node* node; diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 72b47cfd84..9434652a3a 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ 
b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -407,7 +407,7 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it if (func != nullptr) { FuncInfo func_info; functions_in.emplace(node.op(), func_info); - InlineFunction(node, *func, ctx, optimized_graph, functions_in, frame_name); + InlineFunction(node, *func, ctx, optimized_graph, functions_in, ++frame_name); functions_in.erase(node.op()); // At this point functions_in will be empty // Check if the function node corresponded to some fetch_outputs diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc index 86c56a2c6c..62344aca75 100644 --- a/tensorflow/core/grappler/utils/topological_sort.cc +++ b/tensorflow/core/grappler/utils/topological_sort.cc @@ -40,7 +40,7 @@ void TopologicalSort(GraphDef* graph) { ready_nodes.push_back(node); back++; } - bool recursion_merge = 0; + bool recursion_merge = false; if (IsMerge(*node)) { ready_inputs[node] = 0; @@ -50,12 +50,12 @@ void TopologicalSort(GraphDef* graph) { } else if (IsCall(*output_map.GetNode(input))) { ready_inputs[node] ++; - recursion_merge = 1; + recursion_merge = true; } } if (recursion_merge) { ready_inputs[node]--; - recursion_merge = 0; + recursion_merge = false; } } else if (IsReturn(*node)) { @@ -67,12 +67,14 @@ void TopologicalSort(GraphDef* graph) { // In order to detect the recursion cycles we depend on // the fact that a recursive function's returning node, // will be sending outputs to at least 2 "Return" nodes - // with different "frame_name" attributes (same "frame_name" + // with different "call_id" attributes (same "call_id" // attrs would mean that they belong in the same function call // but they correspond to different function outputs) - int call_id; - GetNodeAttr(AttrSlice(*node), "call_id", &call_id); - returning_nodes[prevNode].emplace(call_id); + if (!StringPiece(input).starts_with("^")) { + int call_id; + 
GetNodeAttr(AttrSlice(*node), "call_id", &call_id); + returning_nodes[prevNode].emplace(call_id); + } } ready_inputs[node] = 0; From f47355af1d6aee779277f8d8cef9d6e951b5e7c1 Mon Sep 17 00:00:00 2001 From: DelphianCalamity Date: Sat, 15 Sep 2018 19:04:39 +0300 Subject: [PATCH 63/64] ackermann --- TESTS/ackermann.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/TESTS/ackermann.py b/TESTS/ackermann.py index bac81b2088..680e1a618a 100644 --- a/TESTS/ackermann.py +++ b/TESTS/ackermann.py @@ -1,21 +1,35 @@ import tensorflow as tf from tensorflow.python.framework import function -ack = function.Declare("ack", [("n", tf.int32), ("m", tf.int32)], [("ret", tf.int32)]) +ack = function.Declare("Ack", [("m", tf.int32), ("n", tf.int32)], [("ret", tf.int32)]) @function.Defun(tf.int32, tf.int32, func_name="Ack", out_names=["ret"]) -def AckImpl(n,m): - return tf.cond(tf.equal(m, 0), - lambda: n + 1, - tf.cond(tf.equals(n, 0), - lambda: ack(m-1,1), - lambda: ack(m-1,ack(m,n-1)))) +def AckImpl(m,n): + + def f1(): + ret = n + 1 + return ret + + def f2(): + def ff1(): + r = ack(m-1,1) + return r + + def ff2(): + r = ack(m-1, ack(m, n-1)) + return r + + ret = tf.cond(tf.equal(n, 0), ff1, ff2) + return ret + + return tf.cond(tf.equal(m, 0), f1, f2) + AckImpl.add_to_graph(tf.get_default_graph()) n = tf.placeholder(tf.int32, shape=[]) m = tf.placeholder(tf.int32, shape=[]) -res = ack(n,m) +res = ack(m,n) writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) @@ -23,7 +37,8 @@ def AckImpl(n,m): #print(tf.get_default_graph().as_graph_def()) -writer.close() -print(sess.run(res, feed_dict={n:2, m:3})) +print(sess.run(res, feed_dict={m:2, n:3})) sess.close() + +writer.close() From 0497209572c636819613df5d2bc8a58a5d12ea70 Mon Sep 17 00:00:00 2001 From: Calliope Kostopoulou Date: Mon, 17 Sep 2018 11:35:57 +0300 Subject: [PATCH 64/64] Distributed Runtime (#11) --- TESTS/create_worker.py | 13 + 
TESTS/distributed/distr_factorial.py | 40 + TESTS/distributed/distr_fcallsg.py | 44 ++ TESTS/distributed/distr_fibonacci.py | 39 + TESTS/distributed/distr_fog.py | 36 + TESTS/distributed/distr_funcSimple.py | 36 + TESTS/distributed/distr_mutrec.py | 49 ++ TESTS/fog.py | 26 + tensorflow/core/common_runtime/executor.cc | 70 +- .../common_runtime/graph_execution_state.cc | 2 +- .../distributed_runtime/master_session.cc | 48 ++ tensorflow/core/graph/graph_partition.cc | 686 +++++++++++++++++- .../optimizers/function_transformation.cc | 47 +- 13 files changed, 1089 insertions(+), 47 deletions(-) create mode 100644 TESTS/create_worker.py create mode 100644 TESTS/distributed/distr_factorial.py create mode 100644 TESTS/distributed/distr_fcallsg.py create mode 100644 TESTS/distributed/distr_fibonacci.py create mode 100644 TESTS/distributed/distr_fog.py create mode 100644 TESTS/distributed/distr_funcSimple.py create mode 100644 TESTS/distributed/distr_mutrec.py create mode 100644 TESTS/fog.py diff --git a/TESTS/create_worker.py b/TESTS/create_worker.py new file mode 100644 index 0000000000..62e80ade27 --- /dev/null +++ b/TESTS/create_worker.py @@ -0,0 +1,13 @@ +# Get task number from command line +import sys +task_number = int(sys.argv[1]) + +import tensorflow as tf + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) +server = tf.train.Server(cluster, job_name="local", task_index=task_number) + +print("Starting server #{}".format(task_number)) + +server.start() +server.join() diff --git a/TESTS/distributed/distr_factorial.py b/TESTS/distributed/distr_factorial.py new file mode 100644 index 0000000000..dfbf931b20 --- /dev/null +++ b/TESTS/distributed/distr_factorial.py @@ -0,0 +1,40 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) + +fac = function.Declare("Fac", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, 
func_name="Fac", out_names=["ret"]) +def FacImpl(n): + + def f1(): + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + ret = tf.constant(1) + return ret + def f2(): + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + ret = n * fac(n - 1) + return ret + + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + pred = tf.less_equal(n, 1) + + return tf.cond(pred, f1, f2) + +FacImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +x = tf.add(n, 1) +result = fac(x) +y = tf.add(result, 1) + +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session("grpc://localhost:2222") as sess: + print(sess.run(y, feed_dict={n: 5})) + +writer.close() + diff --git a/TESTS/distributed/distr_fcallsg.py b/TESTS/distributed/distr_fcallsg.py new file mode 100644 index 0000000000..da241b6965 --- /dev/null +++ b/TESTS/distributed/distr_fcallsg.py @@ -0,0 +1,44 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) + +@function.Defun(tf.float32) +def G(x): + + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + ret = x + x + + return ret + + +@function.Defun(tf.float32, tf.float32) +def MyFunc(x, y): + + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + g1 = G(x) + g2 = G(y) + + ret = g1 + g2 + + return ret + + +# Building the graph. 
+ +a = tf.constant([4.0], name="a") +b = tf.placeholder(tf.float32, name="MyPlaceHolder") + +add = tf.add(a, b, name="add") +sub = tf.subtract(a, b, name="sub") + +ret = MyFunc(add, sub, name='mycall') + +#x = tf.add(c, d) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session("grpc://localhost:2222") as sess: + print(sess.run([ret], feed_dict={b:1})) + +writer.close() diff --git a/TESTS/distributed/distr_fibonacci.py b/TESTS/distributed/distr_fibonacci.py new file mode 100644 index 0000000000..e8c3e59f88 --- /dev/null +++ b/TESTS/distributed/distr_fibonacci.py @@ -0,0 +1,39 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) + +fib = function.Declare("Fib", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="Fib", out_names=["ret"]) +def FibImpl(n): + + def f1(): + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + ret = tf.constant(1) + return ret + def f2(): + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + fib1 = fib(n-1) + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + fib2 = fib(n-2) + + return fib1 + fib2 + + return tf.cond(tf.less_equal(n, 1), f1, f2) + +FibImpl.add_to_graph(tf.get_default_graph()) + +n = tf.placeholder(tf.int32, shape=[]) +x = fib(n) + +res = tf.add(x, 1) + +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session("grpc://localhost:2222") as sess: + print(sess.run(res, feed_dict={n: 20})) + +writer.close() diff --git a/TESTS/distributed/distr_fog.py b/TESTS/distributed/distr_fog.py new file mode 100644 index 0000000000..430665a7de --- /dev/null +++ b/TESTS/distributed/distr_fog.py @@ -0,0 +1,36 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) + 
+@function.Defun(tf.float32) +def G(x): + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + add = x + 1 + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + ret = x * add + return ret + +@function.Defun(tf.float32) +def F(x): + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + add = x + 1 + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + ret = x * add + return ret + + +a = tf.constant([4.0], name="a") +b = tf.placeholder(tf.float32, name="MyPlaceHolder") + +add = tf.add(a, b, name="add") + +ret = F(G(add), name='mycall') + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session("grpc://localhost:2222") as sess: + print(sess.run([ret], feed_dict={b:1})) + +writer.close() + diff --git a/TESTS/distributed/distr_funcSimple.py b/TESTS/distributed/distr_funcSimple.py new file mode 100644 index 0000000000..1fbe935696 --- /dev/null +++ b/TESTS/distributed/distr_funcSimple.py @@ -0,0 +1,36 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) + +@function.Defun(tf.int32, tf.int32) +def MyFunc(x, y): + + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + add1 = x + y + + return [add1, x - y] + + +# Building the graph. 
+ +a = tf.constant([4], name="x") +b = tf.placeholder(tf.int32, name="MyPlaceHolder") + +with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + add = tf.add(a, b, name="add") + +with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + sub = tf.subtract(a, b, name="sub") + +[c,d] = MyFunc(add, sub, name='mycall') + +x = tf.add(c, d) + +#print(tf.get_default_graph().as_graph_def()) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session("grpc://localhost:2222") as sess: + print(sess.run([x], feed_dict={b:1})) +writer.close() diff --git a/TESTS/distributed/distr_mutrec.py b/TESTS/distributed/distr_mutrec.py new file mode 100644 index 0000000000..864809e57e --- /dev/null +++ b/TESTS/distributed/distr_mutrec.py @@ -0,0 +1,49 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]}) + +f = function.Declare("F", [("n", tf.int32)], [("ret", tf.int32)]) +g = function.Declare("G", [("n", tf.int32)], [("ret", tf.int32)]) + +@function.Defun(tf.int32, func_name="F", out_names=["ret"]) +def FImpl(n): + + def f1(): + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + ret = tf.constant(1) + return ret + def f2(): + with tf.device("/job:local/replica:0/task:0/device:CPU:0"): + x = n - 1 + ret = g(x) + return ret + +# with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + pred = tf.less_equal(n, 1) + + return tf.cond(pred, f1, f2) + + +@function.Defun(tf.int32, func_name="G", out_names=["ret"]) +def GImpl(n): + + with tf.device("/job:local/replica:0/task:1/device:CPU:0"): + x = n - 1 + ret = f(x) + return ret + + +FImpl.add_to_graph(tf.get_default_graph()) +GImpl.add_to_graph(tf.get_default_graph()) + + +n = tf.placeholder(tf.int32, name="MyPlaceHolder") +x = f(n) + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session("grpc://localhost:2222") as sess: + print(sess.run([x], 
feed_dict={n:4})) + +writer.close() diff --git a/TESTS/fog.py b/TESTS/fog.py new file mode 100644 index 0000000000..f6a21e7a8f --- /dev/null +++ b/TESTS/fog.py @@ -0,0 +1,26 @@ +import tensorflow as tf +from tensorflow.python.framework import function + +@function.Defun(tf.float32) +def G(x): + return x * x + + +@function.Defun(tf.float32) +def F(x): + return x + x + + +a = tf.constant([4.0], name="a") +b = tf.placeholder(tf.float32, name="MyPlaceHolder") + +add = tf.add(a, b, name="add") + +ret = F(G(add), name='mycall') + +writer = tf.summary.FileWriter('./graphs', tf.get_default_graph()) + +with tf.Session() as sess: + print(sess.run([ret], feed_dict={b:1})) + +writer.close() diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 00f69a0a41..ed3a889ee0 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1430,7 +1430,6 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, parent = parent_nodes[call_node_id]; frame_name = cf_info->frame_names[call_node_id]; } else { - // is this even possible (encounter a Return before a Call) ?? ready.push_back(curr_node); continue; } @@ -1443,6 +1442,8 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g, Node* out = out_edge->dst(); const int out_id = out->id(); + if (IsReturn(out) && out_edge->IsControlEdge()) continue; + // Add to ready queue if not visited. bool is_visited = visited[out_id]; if (!is_visited) { @@ -1990,6 +1991,9 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, VLOG(2) << "Frame: " << input_frame->frame_name; } + printf("Propagate Outputs: %s, am i alive? 
%d\n", node->name().c_str(), !is_dead); + printf("Frame: %s\n", input_frame->frame_name.c_str()); + if (!item->is_enter_exit_or_next_iter && !item->is_call_or_return) { // Fast path for nodes types that don't need special handling DCHECK_EQ(input_frame, output_frame); @@ -2037,32 +2041,32 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, input_iter, ready); } } else if (item->is_call) { - if (is_dead) { - // Stop the deadness propagation. - output_frame = nullptr; - } else { - FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame); - output_iter = 0; - { - const NodeItem *item = impl_->gview_.node(node->id()); - mutex_lock l(output_frame->mu); - output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); - output_frame->num_pending_inputs--; - } +// if (is_dead) { +// // Stop the deadness propagation. +// output_frame = nullptr; +// } else { + FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame); + output_iter = 0; + { + const NodeItem *item = impl_->gview_.node(node->id()); + mutex_lock l(output_frame->mu); + output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); + output_frame->num_pending_inputs--; } +// } is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); } else if (item->is_return) { - if (is_dead) { - // Stop the deadness propagation. - output_frame = nullptr; - } else { - output_frame = input_frame->parent_frame; - output_iter = input_frame->parent_iter; - { - mutex_lock l(output_frame->mu); - output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); - } +// if (is_dead) { +// // Stop the deadness propagation. 
+// output_frame = nullptr; +// } else { + output_frame = input_frame->parent_frame; + output_iter = input_frame->parent_iter; + { + mutex_lock l(output_frame->mu); + output_frame->ActivateNodes(item, is_dead, output_iter, outputs, ready); } +// } is_frame_done = input_frame->DecrementOutstandingOps(&impl_->gview_, input_iter, ready); } else { DCHECK(IsNextIteration(node)); @@ -2577,8 +2581,23 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, int pending, dead; iter_state->adjust_for_activation(dst_pending_id, increment_dead, &pending, &dead); - dst_dead = (dead > 0); - dst_ready = (pending == 0); + + + if (dst_item->is_return && increment_dead) { + // The only dead input a Return op will ever get + // is the control input propagated to it from a corresponding + // dead Call op in case of an untaken branch. So at this point + // we are certain that Return op will never receive another input. + // Therefore, we force it to be added in queue for the sake of + // deadness propagation and we adjust it for activation once more, + // so that it no longer waits for another (never coming) input. 
+ iter_state->adjust_for_activation(dst_pending_id, increment_dead, + &pending, &dead); + } + + dst_dead = (dead > 0); + dst_ready = (pending == 0); + } if (dst_need_input) { @@ -2593,6 +2612,7 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item, // Add dst to the ready queue if it's ready if (dst_ready) { + printf(" Add in queue: %s\n", dst_item->node->name().c_str()); if (dst_item->is_control_trigger) dst_dead = false; ready->push_back(TaggedNode(dst_item->node, this, iter, dst_dead)); iter_state->outstanding_ops++; diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 7ace622237..7d09fe4f59 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -363,7 +363,7 @@ Status GraphExecutionState::OptimizeGraph( optimized_graph->reset(new Graph(OpRegistry::Global())); TF_RETURN_IF_ERROR( ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); -/******************************************************************************************* +/*******************************************************************************************/ // Write an event, so that we can visualize this optimized graph in tensorboard EventsWriter writer("Fully_Optimized"); Event event; diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index 995422644a..d75ec3f644 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -19,6 +19,10 @@ limitations under the License. 
#include #include +#include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" + #include "tensorflow/core/common_runtime/process_util.h" #include "tensorflow/core/common_runtime/profile_handler.h" #include "tensorflow/core/common_runtime/stats_publisher_interface.h" @@ -280,6 +284,39 @@ Status MasterSession::ReffedClientGraph::RegisterPartitions( std::unordered_map graph_defs; Status s = DoBuildPartitions(popts, &graph_defs); if (s.ok()) { + + +printf("\n\n MASTER PARTITIONS:\n"); +int i=0; +for (const auto& it: graph_defs) { + string dvc = it.first; + const GraphDef* graphDef = &it.second; + printf("\n\nDeviceName :'%s'\n", dvc.c_str()); + printf("Partition GraphDef:\n %s\n", SummarizeGraphDef(*graphDef).c_str()); + + string p = strings::StrCat("Partition", i); i++; + EventsWriter writer(p); + Event event; + event.set_wall_time(1234); + event.set_step(34); + + const size_t proto_size = graphDef->ByteSizeLong(); + void* buf = port::Malloc(proto_size); + if (buf == nullptr) { + return errors::ResourceExhausted( + "Failed to allocate memory to serialize message of type '" , + graphDef->GetTypeName(), "' and size ", proto_size); + } + graphDef->SerializeToArray(buf, proto_size); + const void* bf = buf; + event.set_graph_def(bf, proto_size); + writer.WriteEvent(event); + +} + + + + // NOTE(mrry): The pointers in `graph_defs_for_publishing` do not remain // valid after the call to DoRegisterPartitions begins, so // `stats_publisher_` must make a copy if it wants to retain the @@ -1543,9 +1580,20 @@ Status MasterSession::DoRunWithLocalExecution( pss.collect_rpcs = ph->should_collect_rpcs(); } +// For future "execution-time" testing - when run on truly seperate machines +// clock_t t; +// t = clock(); + Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp, &cancellation_manager_, false); if (s.ok()) { + +// +// t = clock() - t; +// std::cout << "time: " << t << " 
miliseconds" << std::endl; +// std::cout << "time: " << t*1.0/CLOCKS_PER_SEC << " seconds" << std::endl; + + pss.end_micros = Env::Default()->NowMicros(); // Schedule post-processing and cleanup to be done asynchronously. diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index 71d8cdd6ab..2744be9077 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -22,6 +22,10 @@ limitations under the License. #include #include +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" +#include "tensorflow/core/graph/graph_constructor.h" + #include "tensorflow/core/framework/memory_types.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -38,6 +42,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/device_name_utils.h" + +#include "tensorflow/core/framework/graph_def_util.h" + namespace tensorflow { namespace { @@ -931,20 +938,681 @@ void SetIncarnation(const PartitionOptions& opts, GraphDef* gdef) { } } + +/**************************************************************************************************/ + +struct StateMachineNodeInput { + string src; + int index; +}; + +struct StateMachineParent { + Node* parent_node; + int parent_index; +}; + +struct StateMachineNode { + Node* node; + std::vector inputs; +}; + +struct StateMachineGraph { + std::unordered_map nodes; + std::set depends_on; + Node* merge; +}; + +struct StateMachine { + // A map from unique_ids to StateMachineGraphs representing a general dynamic + // state machine that we update every time a function gets called, and helps us + // gradually build the state machines of the partitions + std::unordered_map state_machine_graphs; + // state_machine_parents is the 'spine' of the graph, + // containing only control flow nodes + std::vector state_machine_parents; + + 
std::unordered_map switches_info; + // + std::unordered_map switchToPred; + + string leader_partition; + + // Maps device names to smaller strings + std::unordered_map device_names_map; + + std::unordered_map*> partitionsToSMG; +}; + +struct FuncInfo { + // A map from to the num of function's arguments + std::unordered_map funcInputs; + // Helps us separate functions with same frame_name but + // different non recursive call sites + std::unordered_map funcVisitedCounter; + // Each vector below operates as a barrier, + // we don't call CallingFunction(..) before we gather + // all function's arguments/calls first + std::unordered_map*> funcCalls; +}; + +// Adds root nodes into ready_nodes queue and sets ready_inputs appropriately +void PreprocessGraph(std::unordered_map &ready_inputs, Graph* g, + std::deque &ready_nodes) { + + std::unordered_map> returning_nodes; + + for (Node* node : g->nodes()) { + + if (node->in_edges().empty()) { + ready_nodes.push_back(node); + } + bool recursion_merge = 0; + if (IsMerge(node)) { + ready_inputs[node] = 0; + for (const Edge* in_edge : node->in_edges()) { + + Node* in = in_edge->src(); + // if (IsNextIteration(*output_map.GetNode(input))) { + // ready_inputs[node]++; + // } + if (IsCall(in)) { + ready_inputs[node]++; + recursion_merge = 1; + } + } + if (recursion_merge) { + ready_inputs[node]--; + recursion_merge = 0; + } + + } else if (IsReturn(node)) { + + for (const Edge* in_edge : node->in_edges()) { + Node* in = in_edge->src(); + + if (!in_edge->IsControlEdge()) { + int call_id; + GetNodeAttr(node->attrs(), "call_id", &call_id); + returning_nodes[in].emplace(call_id); + } + } + ready_inputs[node] = 0; + + } else { + ready_inputs[node] = 0; + } + } + + for (const auto& retnode : returning_nodes) { + if (retnode.second.size() > 1) { + // Detected Cycle + ready_inputs[retnode.first]++; + } + } +} + +string GetDeviceMappedName(StateMachine &state_machine, string device_name) { + + std::unordered_map& device_map = 
state_machine.device_names_map; + + auto slot = &device_map[device_name]; + if (*slot == "") + *slot = strings::StrCat("_p", device_map.size() + 1); + return *slot; +} + +bool IsCallSuccessor(Node* node) { + + for (const Edge* in_edge : node->in_edges()) { + Node* src = in_edge->src(); + if (IsCall(src) && !in_edge->IsControlEdge()) + return true; + } + return false; +} + +void DeleteStateMachineGraph(StateMachine& state_machine, string unique_id) { + + StateMachineGraph *smg = state_machine.state_machine_graphs[unique_id]; + + for (auto& it : smg->nodes) + delete it.second; + delete smg; +} + +std::vector* GetOrCreateCalls(int call_id, std::unordered_map*> &funcCalls) { + auto slot = &funcCalls[call_id]; + if (*slot == nullptr) + *slot = new std::vector; + return *slot; +} + +std::set* GetOrCreatePartition(string partition, std::unordered_map*> &partsTpSmg) { + auto slot = &partsTpSmg[partition]; + if (*slot == nullptr) + *slot = new std::set; + return *slot; +} + +// For one if-else construction there are more than one Switch nodes guarding all the inputs +// that are needed inside the branches but live outside of them. 
We need to collect all the Switch +// nodes that correspond to one if-else construction and treat them as one in the state machines +// switches_info: Every switch node maps to the original switch that we "ll take into account +void CollectSwitches(Graph* g, StateMachine& state_machine) { + + std::unordered_map pred_switch; + + for (Node *node : g->nodes()) { + + if (IsSwitch(node)) { + + for (const Edge *in_edge : node->in_edges()) { + + int port = in_edge->dst_input(); + + // A sloppy way to determine if this is the predicate input + if (!in_edge->IsControlEdge() && port == 1) { + + Node *predicate = in_edge->src(); + + while (IsIdentity(predicate)) { + for (const Edge *inEdge : predicate->in_edges()) { + if (!inEdge->IsControlEdge()) { + predicate = inEdge->src(); + break; + } + } + } + + // We 've got the real predicate + Node *switchNode; + if (pred_switch.find(predicate) == pred_switch.end()) { + // Original switch + pred_switch[predicate] = node; + state_machine.switchToPred[node] = predicate; + switchNode = node; + } else { + // "Synonym" switch + switchNode = pred_switch[predicate]; + } + + state_machine.switches_info[node] = switchNode; + + break; + } + } + printf("Switch : %s -> %s\n", node->name().c_str(), state_machine.switches_info[node]->name().c_str()); + } + } + + printf("\n\n\n"); +} + +void GatherPartitionStateMachines(StateMachine& state_machine, std::set* smgs) { + + std::deque queue; + + for (auto& it : *smgs) + queue.push_back(it); + + while (!queue.empty()) { + string smg = queue.front(); + queue.pop_front(); + + StateMachineGraph* sm_graph = state_machine.state_machine_graphs[smg]; + for (auto& it : sm_graph->depends_on) { + // If not already visited + if (smgs->find(it) == smgs->end()) { + smgs->emplace(it); + queue.push_back(it); + } + } + } +} + +NodeDef* FindNodeInGraphDef(GraphDef& graphDef, string node_name) { + + for (NodeDef& nodeDef : *graphDef.mutable_node()) { + if (nodeDef.name() == node_name) + return &nodeDef; + } + return 
nullptr; +} + +void ConnectMergeToNode(GraphDef& graphDef, string merge_name, string node_name, + StateMachine& state_machine, string partition_name) { + + // We can safely infer the correct Merge's name and add it as control input to the node + // even though partition state machine's Merge has not already been added into graphdef + string suffix; + (partition_name != state_machine.leader_partition) ? + (suffix = GetDeviceMappedName(state_machine, partition_name)) : (suffix = ""); + + //Add as control input + NodeDef* node = FindNodeInGraphDef(graphDef, node_name); + *node->add_input() = strings::StrCat("^", merge_name, suffix); +} + +void AddPartitionStateMachine(StateMachine& state_machine, GraphDef& main_graphDef, + string unique_id, string partition) { + + StateMachineGraph *sm_graph = state_machine.state_machine_graphs[unique_id]; + string suffix = GetDeviceMappedName(state_machine, partition); + for (const auto &it : sm_graph->nodes) { + string node_name = it.first; + StateMachineNode *sm_node = it.second; + Node *node = sm_node->node; + + // Build NodeDef + NodeDef *nodedef = main_graphDef.add_node(); + //Note: suffix does not guarantee that name is unique + nodedef->set_name(strings::StrCat(node_name, suffix)); + nodedef->set_op(node->op_def().name()); + nodedef->set_device(partition); + + // Add Inputs + for (int i = 0; i < sm_node->inputs.size(); ++i) { + // There won't exist any control inputs here + nodedef->add_input(strings::StrCat(sm_node->inputs[i].src, suffix, ":", sm_node->inputs[i].index)); + + if (StringPiece(sm_node->inputs[i].src).starts_with("Dummy_")) { + Tensor tensor(DT_INT32, TensorShape({0})); + NodeDef* dummy = main_graphDef.add_node(); + dummy->set_name(strings::StrCat(sm_node->inputs[i].src, suffix)); + dummy->set_op("Const"); + dummy->set_device(partition); + AddNodeAttr("dtype", DT_INT32, dummy); + AddNodeAttr("value", tensor, dummy); + } + } + + if (IsSwitch(node)) { + // Add predicate input too + 
nodedef->add_input(state_machine.switchToPred[node]->name()); + // Add control input from partition's Merge to partition's Switch + nodedef->add_input(strings::StrCat("^", sm_graph->merge->name(), suffix)); + } + + for (const auto &itt : node->def().attr()) { + // Not sure if this is copying attrs correctly + if (itt.first == "T") { + // We don't care about keeping the original "T" attr + // in state machine nodes + AddNodeAttr(itt.first, DT_INT32, nodedef); + } else + AddNodeAttr(itt.first, itt.second, nodedef); + } + } +} + +void AddNodeToStateMachine(StateMachine& state_machine, string unique_id, Node* node, bool cycle) { + + StateMachineGraph *smg = state_machine.state_machine_graphs[unique_id]; + StateMachineNode *smn = new StateMachineNode; + + smn->node = node; + + StateMachineParent *parent = &state_machine.state_machine_parents[node->id()]; + + if (parent->parent_node == nullptr) { + int call_id; + GetNodeAttr(node->attrs(), "call_id", &call_id); + smn->inputs.push_back({strings::StrCat("Dummy_", call_id), 0}); + } else + smn->inputs.push_back({parent->parent_node->name(), parent->parent_index}); + + smg->nodes[node->name()] = smn; + + // If cycle is true, node is a recursive call, that needs to be added as + // input to the corresponding Merge node + if (cycle) { + // We traverse graph the way topological sort does, so we will never + // meet a recursive call node before its corresponding Merge + StateMachineNode* merge = smg->nodes[smg->merge->name()]; + merge->inputs.push_back({node->name(), 0}); + } +} + +void CallingFunction(Graph* graph, GraphDef& main_graphDef, StateMachine& state_machine, FuncInfo& funcInfo, + string function_frame_name, int function_call_id, + std::unordered_map& ready_inputs, + std::deque& prev_ready_nodes) { + + Node *merge, *call; + std::deque ready_nodes; + + string function_unique_id = strings::StrCat(function_frame_name, ":", + funcInfo.funcVisitedCounter[function_frame_name]); + + std::vector* calls = 
funcInfo.funcCalls[function_call_id]; + for (int i=0; i < calls->size(); ++i) { + ready_nodes.push_back((*calls)[i]); + } + call = (*calls)[0]; + + // We add only one Call node for all possible function's args in the state machine + AddNodeToStateMachine(state_machine, function_unique_id, call, false); + + std::vector& state_machine_parents = state_machine.state_machine_parents; + StateMachineGraph* sm_graph = state_machine.state_machine_graphs[function_unique_id]; + + // Call's successor (the non control output) will be either + // a Merge node (in case of recursion) or an Identity node. + // Either way we add that successor to the state machine, too. + // Same as above, we add only one Merge node instead of one per function's arg + for (const Edge* out_edge : call->out_edges()) { + if (!out_edge->IsControlEdge()) { + merge = out_edge->dst(); + state_machine_parents[merge->id()].parent_node = call; + state_machine_parents[merge->id()].parent_index = 0; + AddNodeToStateMachine(state_machine, function_unique_id, merge, false); + sm_graph->merge = merge; + break; + } + } + + while (!ready_nodes.empty()) { + + Node* ready_node = ready_nodes.front(); + ready_nodes.pop_front(); + + int parent_index = 0; + Node* parent = state_machine_parents[ready_node->id()].parent_node; + + // The ops below need to update the parent + if (IsCall(ready_node)) { + parent = call; + } else if (IsCallSuccessor(ready_node)) { + parent = merge; + } else if (IsSwitch(ready_node)) { + Node *sw = state_machine.switches_info[ready_node]; + if (sw == ready_node) + AddNodeToStateMachine(state_machine, function_unique_id, ready_node, false); + parent = sw; + } else if (IsMerge(ready_node)) { + // Control Flow (regular) Merge has a corresponding Switch node + // Parent gets the value of that switch node's parent + parent = state_machine_parents[parent->id()].parent_node; + parent_index = state_machine_parents[parent->id()].parent_index; + } else if (IsReturn(ready_node)) { + // Return needs to 
propagate its corresponding Call's parent to all its successors + for (const Edge* in_edge : ready_node->in_edges()) { + if (in_edge->IsControlEdge()) { + Node* call_node = in_edge->src(); + parent = state_machine_parents[call_node->id()].parent_node; + parent_index = state_machine_parents[call_node->id()].parent_index; + break; + } + } + int call_id; + GetNodeAttr(ready_node->attrs(), "call_id", &call_id); + // If not a 'recursive' return + if (call_id == function_call_id) { + // Add the successors of Return node to prev_ready_nodes queue + prev_ready_nodes.push_back(ready_node); + // Set the parent value of the only actual output of return + for (const Edge* out_edge : ready_node->out_edges()) { + Node* out = out_edge->dst(); + state_machine_parents[out->id()].parent_node = parent; + state_machine_parents[out->id()].parent_index = parent_index; + break; + } + continue; + } + } + + // Process ready_node's outputs + for (const Edge* out_edge : ready_node->out_edges()) { + Node* out = out_edge->dst(); + + ready_inputs[out]++; + + // For a cross-device edge, on the dst device, add a control edge + // from the merge node of the state machine to dst. If a send/recv is + // introduced for this edge in future partitioning, we delete this + // control edge and add a new control edge from the merge to the recv. 
+ const string& src_device = ready_node->assigned_device_name(); + const string& dst_device = out->assigned_device_name(); + if (src_device != dst_device) { + if (IsCallSuccessor(ready_node) && IsConstant(out)) { + // Remove this control edge that ensures constant executes in the same frame, + // and add a new one from the Constant's partition's state machine merge to the constant + NodeDef* con_node = FindNodeInGraphDef(main_graphDef, out->name()); + for (string& input : *con_node->mutable_input()) { + if (StringPiece(input).starts_with(strings::StrCat("^", ready_node->name()))) { + string suffix = GetDeviceMappedName(state_machine, dst_device); + input = strings::StrCat("^", merge->name(), suffix); + break; + } + } + } else + ConnectMergeToNode(main_graphDef, merge->name(), out->name(), state_machine, dst_device); + } + + if (ready_inputs[out] == out->in_edges().size()) { + + if (IsSwitch(ready_node)) { + // We need to fix parent_index appropriately + parent_index = out_edge->src_output(); + } + + // Set node's parent + state_machine_parents[out->id()].parent_node = parent; + state_machine_parents[out->id()].parent_index = parent_index; + + std::unordered_map& sm_graphs = state_machine.state_machine_graphs; + + if (IsCall(out)) { + + string frame_name; + GetNodeAttr(out->attrs(), "frame_name", &frame_name); + int call_id; + GetNodeAttr(out->attrs(), "call_id", &call_id); + + std::vector* calls = GetOrCreateCalls(call_id, funcInfo.funcCalls); + calls->push_back(out); + + if (funcInfo.funcInputs[frame_name] == calls->size()) { + + // We gathered all function's inputs + + string unique_id = strings::StrCat(frame_name, ":", funcInfo.funcVisitedCounter[frame_name]); + + if (sm_graphs.find(unique_id) == sm_graphs.end()) { + + sm_graphs.emplace(unique_id, new StateMachineGraph); + CallingFunction(graph, main_graphDef, state_machine, funcInfo, frame_name, call_id, ready_inputs, ready_nodes); + funcInfo.funcVisitedCounter[frame_name]++; + } else { + // Recursive Call 
(either to the same function or another one (mutual recursion) + AddNodeToStateMachine(state_machine, unique_id, (*calls)[0], true); + // Add the recursive call nodes to ready_nodes + for (int i=0; i < calls->size(); ++i) + ready_nodes.push_back((*calls)[i]); + } + + sm_graphs[unique_id]->depends_on.emplace(function_unique_id); + } + } else { + GetOrCreatePartition(dst_device, state_machine.partitionsToSMG)->emplace(function_unique_id); + ready_nodes.push_back(out); + } + } + } + } +} + +Status AddFunctionStateMachines(const PartitionOptions& opts, + Graph* g, GraphDef& main_graphDef, GraphInfo* g_info) { + + Status status; + GraphDefBuilder::Options bopts(g, &status); + + FuncInfo funcInfo; + int nodes_num = g->num_node_ids(); + + const FunctionDefLibrary& fdef = opts.flib_def->ToProto(); + for (const FunctionDef& func : fdef.function()) { + + int num_inputs = func.signature().input_arg_size(); + string name = func.signature().name(); + funcInfo.funcInputs[name] = num_inputs; + funcInfo.funcVisitedCounter[name] = 0; + } + + StateMachine state_machine; + state_machine.state_machine_parents.resize(nodes_num); + + CollectSwitches(g, state_machine); + + // Add all state machines for cross-device frames. + // A state machine is added only when there is a cross-device edge in a + // non-root frame. 
+ + // Visit nodes the way topological sort does + std::deque ready_nodes; + std::unordered_map ready_inputs; + + PreprocessGraph(ready_inputs, g, ready_nodes); + + // We convert graph to its equivalent graph_def, because it's easier + // to extend it with the GraphDef state machines of partitions + g->ToGraphDef(&main_graphDef); + + while (!ready_nodes.empty()) { + Node* ready_node = ready_nodes.front(); + ready_nodes.pop_front(); + + for (const Edge* out_edge : ready_node->out_edges()) { + Node* out = out_edge->dst(); + + ready_inputs[out]++; + + if (ready_inputs[out] == out->in_edges().size()) { + + if (IsCall(out)) { + string frame_name; + GetNodeAttr(out->attrs(), "frame_name", &frame_name); + int call_id; + GetNodeAttr(out->attrs(), "call_id", &call_id); + + std::vector* calls = GetOrCreateCalls(call_id, funcInfo.funcCalls); + calls->push_back(out); + + if (funcInfo.funcInputs[frame_name] == calls->size()) { + + string unique_id = strings::StrCat(frame_name, ":", funcInfo.funcVisitedCounter[frame_name]); + + // We gathered all function's inputs + state_machine.leader_partition = out->assigned_device_name(); + state_machine.state_machine_graphs.emplace(unique_id, new StateMachineGraph); + CallingFunction(g, main_graphDef, state_machine, funcInfo, frame_name, call_id, ready_inputs, ready_nodes); + funcInfo.funcVisitedCounter[frame_name]++; + + // Adding partition state machines to graph + for (auto& it: state_machine.partitionsToSMG) { + string partition = it.first; + + // Leader Partition already has its state machine + if (partition == state_machine.leader_partition) + continue; + + std::set* smgs = it.second; + + // Collect all the state machine graphs that smgs depend on + GatherPartitionStateMachines(state_machine, smgs); + + for (auto& it : *smgs) + AddPartitionStateMachine(state_machine, main_graphDef, it, partition); + } + + // Deallocate space + for (auto& it : state_machine.partitionsToSMG) + delete it.second; + state_machine.partitionsToSMG.clear(); + 
+ for (auto& it: state_machine.state_machine_graphs) + DeleteStateMachineGraph(state_machine, it.first); + state_machine.state_machine_graphs.clear(); + } + } else + ready_nodes.push_back(out); + } + } + } + + // Deallocate space + for (auto& it : funcInfo.funcCalls) + delete it.second; + +/****************************************************************************/ + printf("\n\nSummarize Main Graph\n %s\n", SummarizeGraphDef(main_graphDef).c_str()); + // Write an event, so that we can visualize this optimized graph in tensorboard + EventsWriter writer("Full_Partitioned"); + Event event; + event.set_wall_time(1234); + event.set_step(34); + + const size_t proto_size = main_graphDef.ByteSizeLong(); + void* buf = port::Malloc(proto_size); + if (buf == nullptr) { + return errors::ResourceExhausted( + "Failed to allocate memory to serialize message of type '" , + main_graphDef.GetTypeName(), "' and size ", proto_size); + } + main_graphDef.SerializeToArray(buf, proto_size); + const void* bf = buf; + event.set_graph_def(bf, proto_size); + writer.WriteEvent(event); +/****************************************************************************/ + + return Status::OK(); +} + + + +/**************************************************************************************************/ + + Status Partition(const PartitionOptions& opts, Graph* g, std::unordered_map* partitions) { Status status; partitions->clear(); GraphInfo g_info; + std::unique_ptr new_g(new Graph(OpRegistry::Global())); + if (!opts.control_flow_added) { // Add the "code" for distributed execution of control flow. Code is // added only for the frames that are placed on multiple devices. The // new graph is an equivalent transformation of the original graph and // has the property that it can be subsequently partitioned arbitrarily // (down to the level of individual device) for distributed execution. 
- status = AddControlFlow(opts, g, &g_info); + GraphDef main_graphDef; + g->ToGraphDef(&main_graphDef); + printf("\n\nSummarize Main Graph:\n %s\n\n", SummarizeGraphDef(main_graphDef).c_str()); + + status = AddControlFlow(opts, g, &g_info); if (!status.ok()) return status; + + GraphDef gdef; + status = AddFunctionStateMachines(opts, g, gdef, &g_info); + if (status.ok()) { + // Convert GraphDef back to Graph so it can be partitioned + GraphConstructorOptions gopts; + gopts.allow_internal_ops = true; + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(gopts, gdef, new_g.get())); + g = new_g.get(); + + // The graph conversion sets the requested device names but not the assigned + // device names. However, since at this point the graph is placed TF expects + // an assigned device name for every node. Therefore we copy the requested + // device into the assigned device field. + for (Node* node : g->nodes()) { + node->set_assigned_device_name(node->requested_device()); + } + } else return status; } // At this point, all the graph mutations have been done. Build memory @@ -994,7 +1662,19 @@ Status Partition(const PartitionOptions& opts, Graph* g, int32 num_input_edges = 0; for (const Edge* edge : dst->in_edges()) { if (edge->IsControlEdge()) { - if (IsMerge(edge->src()) && IsControlLoop(edge->src())) { + if ((IsMerge(edge->src()) && IsControlLoop(edge->src())) || + (IsCallSuccessor(edge->src()) && (!IsConstant(edge->dst()) || + edge->dst()->in_edges().size() > 1))) { + // Note: not all control edges are control flow edges. + // There are also control edges added in + // FunctionTransformation for ensuring that Constants will execute in the + // correct 'frame'. + // We made sure in AddFunctionStateMachines that: + // if a Constant in partition A has such an incoming edge from a CallSuccessor(..) + // node, then this node will definitely belong in the same A partition, so we + // can safely add those edges in "inputs" as we do with common control edges. 
+ // All the other edges whose src node is a CallSuccessor node are control flow edges. + // This is one of the control edges added for control flow. There // can be multiple such edges as the dest node may have multiple // remote inputs. We keep track of the number of such edges. @@ -1102,7 +1782,7 @@ Status Partition(const PartitionOptions& opts, Graph* g, NodeDef* real_recv = nullptr; NodeDef* recv = - AddRecv(opts, g_info, dst_graph, edge, &real_recv, &status); + AddRecv(opts, g_info, dst_graph, edge, &real_recv, &status); if (!status.ok()) return status; // Fix up the control flow edge. diff --git a/tensorflow/core/grappler/optimizers/function_transformation.cc b/tensorflow/core/grappler/optimizers/function_transformation.cc index 9434652a3a..46dd00825a 100644 --- a/tensorflow/core/grappler/optimizers/function_transformation.cc +++ b/tensorflow/core/grappler/optimizers/function_transformation.cc @@ -167,20 +167,21 @@ Status GatherOutputs(const GrapplerItem& item, const FunctionInliningContext& ct Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimized_graph, - std::unordered_map &functions_in, int call_id) { + std::unordered_map &functions_in, int call_id, string device) { const std::unordered_map func_attr(func_node.attr().begin(), func_node.attr().end()); DataType type; ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + NodeDef *call; for (int i = 0; i < func.signature().input_arg_size(); ++i) { const OpDef::ArgDef &arg = func.signature().input_arg(i); // Create and add in graph a Call node for every input arg - NodeDef *call = optimized_graph->add_node(); + call = optimized_graph->add_node(); call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); call->set_op("Call"); - call->set_device(func_node.device()); + call->set_device(device); call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); @@ 
-199,7 +200,7 @@ Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimi NodeDef *ret = optimized_graph->add_node(); ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); ret->set_op("Return"); - ret->set_device(func_node.device()); + ret->set_device(device); // Counting on the fact that op name will be the same as the name given initially to function ret->add_input(strings::StrCat(func_node.op(), "/", functions_in[func_node.op()].fetch[i])); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, "output", arg, &type)); @@ -207,6 +208,9 @@ Status CreateCycle(NodeDef& func_node, const FunctionDef& func, GraphDef* optimi (*ret->mutable_attr())["frame_name"].set_s(func_node.op()); (*ret->mutable_attr())["call_id"].set_i(call_id); (*ret->mutable_attr())["arg_id"].set_i(i); + + // Add a control input from Call to Returns + *ret->add_input() = AsControlDependency(call->name()); } return Status::OK(); } @@ -216,7 +220,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionInliningContext& ctx, GraphDef* optimized_graph, std::unordered_map &functions_in, - int& frame_name) { + int& frame_name, string device) { int cpframe_name = frame_name; @@ -237,16 +241,17 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, functions_in[func_node.op()].fetch = item->fetch; ArgMergeMap& argmerge_map = functions_in[func_node.op()].argMergeMap; + NodeDef* call; for (int i = 0; i < func.signature().input_arg_size(); ++i) { const OpDef::ArgDef& arg = func.signature().input_arg(i); input_nodes[arg.name()] = i; // Create and add in graph a Call node for every input arg - NodeDef* call = optimized_graph->add_node(); + call = optimized_graph->add_node(); call->set_name(strings::StrCat(func_node.name(), "/", "Call_", i)); call->set_op("Call"); - call->set_device(func_node.device()); + call->set_device(device); call->add_input(func_node.input(i)); TF_RETURN_IF_ERROR(CopyArgType(func_node, func_attr, 
"input", arg, &type)); (*call->mutable_attr())["T"].set_type(type); @@ -259,7 +264,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, NodeDef* merge = optimized_graph->add_node(); merge->set_name(strings::StrCat(func_node.name(), "/", "Merge_", i)); merge->set_op("IdentityN"); - merge->set_device(func_node.device()); + merge->set_device(device); merge->add_input(call->name()); argmerge_map.emplace(arg.name(), merge); @@ -285,7 +290,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, } input = AddPrefixToNodeName(input, /*prefix=*/func_node.name()); } - // If the node has no input, make hook it up to the Merge nodes to ensure + // If the node has no input, hook it up to the Merge nodes to ensure // it runs in the same frame as the other nodes of the function body. if (func_body_node.input_size() == 0) { for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it) { @@ -298,7 +303,8 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, func_body_node.set_name(strings::StrCat(func_node.name(), "/", func_body_node.name())); // Make sure the node is placed - func_body_node.set_device(func_node.device()); + string dvc = func_body_node.device(); + (dvc == "") ? 
(func_body_node.set_device(device)) : (func_body_node.set_device(dvc)); // Check if a body node is itself a function const FunctionDef* func_body_node_func = ctx.FindInlinedFunction(func_body_node.op()); @@ -313,12 +319,12 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, if (it == functions_in.end()) { FuncInfo func_info; functions_in.emplace(func_body_node.op(), func_info); - InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in, ++frame_name); + InlineFunction(func_body_node, *func_body_node_func, ctx, optimized_graph, functions_in, ++frame_name, device); functions_in.erase(func_body_node.op()); } else { // Already in -> Insert Enter/Exit ops end create cycle // (recursion or mutually recursive functions) - CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in, ++frame_name); + CreateCycle(func_body_node, *func_body_node_func, optimized_graph, functions_in, ++frame_name, device); } } else { // Move the node to the main graph @@ -332,7 +338,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, NodeDef *ret = optimized_graph->add_node(); ret->set_name(strings::StrCat(func_node.name(), "/", "Ret", i)); ret->set_op("Return"); - ret->set_device(func_node.device()); + ret->set_device(device); // If it takes input from a function string input = item->fetch[i]; if (foutputs.find(input) != foutputs.end()) { @@ -345,9 +351,11 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, (*ret->mutable_attr())["frame_name"].set_s(func_node.op()); (*ret->mutable_attr())["call_id"].set_i(cpframe_name); (*ret->mutable_attr())["arg_id"].set_i(i); + + // Add a control input from Call to Returns + *ret->add_input() = AsControlDependency(call->name()); } - // Break IdentityN Merges into multiple common Binary Merge ops int j=0; for (auto it = argmerge_map.begin(); it != argmerge_map.end(); ++it, ++j) { DataType type; @@ -359,7 +367,7 @@ Status 
InlineFunction(const NodeDef& func_node, const FunctionDef& func, if (size <= 1) { merge->set_op("Identity"); - merge->set_device(func_node.device()); + merge->set_device(device); (*merge->mutable_attr())["T"].set_type(type); } else { merge->set_op("Merge"); @@ -406,8 +414,11 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it const FunctionDef* func = ctx.FindInlinedFunction(node.op()); if (func != nullptr) { FuncInfo func_info; + // All the special nodes of this function and its 'callee-functions' too, + // will colocate in the same device (important for distributed) + string device = node.device(); functions_in.emplace(node.op(), func_info); - InlineFunction(node, *func, ctx, optimized_graph, functions_in, ++frame_name); + InlineFunction(node, *func, ctx, optimized_graph, functions_in, ++frame_name, device); functions_in.erase(node.op()); // At this point functions_in will be empty // Check if the function node corresponded to some fetch_outputs @@ -427,7 +438,7 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it idN = optimized_graph->add_node(); idN->set_op("IdentityN"); idN->set_name(node.name()); - idN->set_device(node.device()); + idN->set_device(device); AttrValue::ListValue* type_list = (*idN->mutable_attr())["T"].mutable_list(); for (const OpDef::ArgDef& arg : func->signature().output_arg()) { @@ -472,7 +483,7 @@ Status FunctionTransformation::Optimize(Cluster* cluster, const GrapplerItem& it const void* bf = buf; event.set_graph_def(bf, proto_size); writer.WriteEvent(event); - /******************************************************************************************************/ + ******************************************************************************************************/ return Status::OK(); }