UM-CSCI-591-Spring-2020 · colligant · Feb 11, 2020
diff --git a/week_3_parallelism.ipynb b/week_3_parallelism.ipynb
@@ -15,7 +15,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -159,14 +159,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 0 Loss 2.169349 Accuracy 0.610571 time taken 759 ms\n",
+      "Epoch 10 Loss 0.776896 Accuracy 0.889714 time taken 697 ms\n",
+      "Epoch 20 Loss 0.458293 Accuracy 0.911143 time taken 715 ms\n",
+      "Epoch 30 Loss 0.335383 Accuracy 0.917429 time taken 831 ms\n",
+      "Epoch 40 Loss 0.269738 Accuracy 0.920857 time taken 704 ms\n",
+      "Epoch 50 Loss 0.228180 Accuracy 0.923714 time taken 776 ms\n",
+      "Epoch 60 Loss 0.199441 Accuracy 0.924571 time taken 727 ms\n",
+      "Epoch 70 Loss 0.178256 Accuracy 0.926000 time taken 690 ms\n",
+      "Epoch 80 Loss 0.161905 Accuracy 0.926000 time taken 851 ms\n",
+      "Epoch 90 Loss 0.148855 Accuracy 0.925714 time taken 822 ms\n",
+      "Epoch 100 Loss 0.138163 Accuracy 0.926857 time taken 835 ms\n",
+      "Epoch 110 Loss 0.129164 Accuracy 0.926286 time taken 739 ms\n",
+      "Epoch 120 Loss 0.121513 Accuracy 0.928857 time taken 716 ms\n",
+      "Epoch 130 Loss 0.114957 Accuracy 0.929714 time taken 710 ms\n",
+      "Epoch 140 Loss 0.109212 Accuracy 0.930571 time taken 694 ms\n",
+      "Epoch 150 Loss 0.104233 Accuracy 0.930857 time taken 698 ms\n",
+      "Epoch 160 Loss 0.099832 Accuracy 0.931429 time taken 692 ms\n",
+      "Epoch 170 Loss 0.095905 Accuracy 0.932000 time taken 712 ms\n",
+      "Epoch 180 Loss 0.092406 Accuracy 0.932571 time taken 814 ms\n",
+      "Epoch 190 Loss 0.089245 Accuracy 0.932571 time taken 751 ms\n",
+      "Epoch 200 Loss 0.086360 Accuracy 0.933143 time taken 699 ms\n",
+      "Epoch 210 Loss 0.083693 Accuracy 0.934857 time taken 730 ms\n",
+      "Epoch 220 Loss 0.081272 Accuracy 0.934571 time taken 774 ms\n",
+      "Epoch 230 Loss 0.078996 Accuracy 0.934857 time taken 774 ms\n",
+      "Epoch 240 Loss 0.076915 Accuracy 0.935143 time taken 896 ms\n"
+     ]
+    }
+   ],
    "source": [
     "\"\"\"\n",
     "Vanilla Gradient Descent\n",
     "\"\"\"\n",
-    "\n",
+    "import os\n",
+    "os.environ['MKL_NUM_THREADS'] = '1'\n",
     "# Hyper Parameters\n",
     "eta = 1e-3\n",
     "initial_batch_size = 104\n",
@@ -245,28 +278,60 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {
     "scrolled": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 0 Loss 2.300931 Accuracy 0.092571 time taken 1264 ms\n",
+      "Epoch 10 Loss 2.231029 Accuracy 0.427429 time taken 979 ms\n",
+      "Epoch 20 Loss 2.114058 Accuracy 0.649714 time taken 959 ms\n",
+      "Epoch 30 Loss 1.974481 Accuracy 0.651429 time taken 970 ms\n",
+      "Epoch 40 Loss 1.833836 Accuracy 0.670857 time taken 979 ms\n",
+      "Epoch 50 Loss 1.701170 Accuracy 0.694000 time taken 952 ms\n",
+      "Epoch 60 Loss 1.579504 Accuracy 0.718857 time taken 1025 ms\n",
+      "Epoch 70 Loss 1.469206 Accuracy 0.744000 time taken 1052 ms\n",
+      "Epoch 80 Loss 1.369618 Accuracy 0.769143 time taken 949 ms\n",
+      "Epoch 90 Loss 1.279815 Accuracy 0.794286 time taken 1547 ms\n",
+      "Epoch 100 Loss 1.198874 Accuracy 0.812571 time taken 1002 ms\n",
+      "Epoch 110 Loss 1.125911 Accuracy 0.833143 time taken 1093 ms\n",
+      "Epoch 120 Loss 1.060112 Accuracy 0.845714 time taken 1103 ms\n",
+      "Epoch 130 Loss 1.000736 Accuracy 0.857429 time taken 972 ms\n",
+      "Epoch 140 Loss 0.947098 Accuracy 0.868857 time taken 1225 ms\n",
+      "Epoch 150 Loss 0.898566 Accuracy 0.877714 time taken 1047 ms\n",
+      "Epoch 160 Loss 0.854572 Accuracy 0.884286 time taken 1027 ms\n",
+      "Epoch 170 Loss 0.814600 Accuracy 0.888000 time taken 990 ms\n",
+      "Epoch 180 Loss 0.778190 Accuracy 0.892000 time taken 1074 ms\n",
+      "Epoch 190 Loss 0.744937 Accuracy 0.896000 time taken 968 ms\n",
+      "Epoch 200 Loss 0.714484 Accuracy 0.899143 time taken 975 ms\n",
+      "Epoch 210 Loss 0.686519 Accuracy 0.902286 time taken 952 ms\n",
+      "Epoch 220 Loss 0.660771 Accuracy 0.903714 time taken 1045 ms\n",
+      "Epoch 230 Loss 0.637003 Accuracy 0.904286 time taken 1186 ms\n",
+      "Epoch 240 Loss 0.615008 Accuracy 0.905714 time taken 1081 ms\n"
+     ]
+    }
+   ],
    "source": [
     "\"\"\"\n",
     "Vanilla Gradient Descent with Data Parallelism\n",
     "\"\"\"\n",
     "\n",
     "#import the ThreadPool\n",
-    "from multiprocessing.Pool import ThreadPool\n",
-    "\n",
+    "from multiprocessing.pool import ThreadPool\n",
+    "# from multiprocessing import Pool as ThreadPool\n",
     "\n",
     "# Hyper Parameters\n",
     "eta = 1e-3\n",
     "initial_batch_size = 104\n",
     "epochs = 250\n",
     "\n",
     "#add additional hyperparameters related to the data parallelism\n",
-    "threads_in_pool = 4\n",
-    "parallel_batches = 4\n",
+    "threads_in_pool = 20\n",
+    "parallel_batches = 16\n",
     "\n",
     "#create the thread pool\n",
     "pool = ThreadPool(processes=threads_in_pool) \n",
@@ -292,7 +357,7 @@
     "    \n",
     "    \n",
     "    # perform gradient descent on mini batches\n",
-    "    for j in range(0, num_batches, parallel_batches):\n",
+    "    for j in range(0, num_batches - parallel_batches + 1, parallel_batches):  # +1: exclusive stop must still admit the last full group of parallel_batches\n",
     "        \n",
     "        #create the list of inputs for the pool threads\n",
     "        #this might look weird, but by creating a list of tuples, the input data can be easily given to\n",
@@ -301,18 +366,18 @@
     "        for k in range(parallel_batches):\n",
     "            minibatchGradientInputLists.append((W1, W2, b1, b2, x_batches[j+k], y_batches[j+k]))\n",
     "        \n",
-    "        #TODO: use the ThreadPool's map function to compute minibatch gradients in parallel.\n",
-    "        gradientOutputs = pool.map(???, ???)\n",
-    "        \n",
-    "        \n",
+    "        # TODO: use the ThreadPool's map function to compute minibatch gradients in parallel.\n",
+    "        gradientOutputs = pool.map(computeMinibatchGradientsTuple, minibatchGradientInputLists)\n",
+    "        # grad_W1, grad_W2, grad_b1, grad_b2\n",
     "        '''\n",
     "        use the gradients to update weights and biases\n",
     "        '''\n",
     "        gradients = list(zip(*gradientOutputs))\n",
-    "        W1 -= eta * ??? #TODO: average (np.mean()) the W1 gradients we put into a list above.\n",
-    "        W2 -= eta * ??? #TODO: average (np.mean()) the W2 gradients we put into a list above.\n",
-    "        b1 -= eta * ??? #TODO: average (np.mean()) the b1 gradients we put into a list above.\n",
-    "        b2 -= eta * ??? #TODO: average (np.mean()) the b2 gradients we put into a list above.\n",
+    "        # gradients = np.mean(gradients, axis=1)\n",
+    "        W1 -= eta * np.mean(gradients[0], axis=0) # TODO: average (np.mean()) the W1 gradients we put into a list above.\n",
+    "        W2 -= eta * np.mean(gradients[1], axis=0) # TODO: average (np.mean()) the W2 gradients we put into a list above.\n",
+    "        b1 -= eta * np.mean(gradients[2], axis=0) # TODO: average (np.mean()) the b1 gradients we put into a list above.\n",
+    "        b2 -= eta * np.mean(gradients[3], axis=0) # TODO: average (np.mean()) the b2 gradients we put into a list above.\n",
     "\n",
     "    # calc loss at end of each epoch\n",
     "    y_entire_pred, Z1 = feedforward(X, W1, W2, b1, b2)\n",
@@ -357,13 +422,32 @@
     "4. Using a library like mpi4py, we could take the local, thread-parallel approach and do it in a true distributed environment. If the computeMinibatchGradients function was being run on different processors in a distributed system, what data would you have to send to the processors for each minibatch? What information would these distributed processors need to send back?\n",
     "\n",
     "\n",
-    "4. As we discussed in class on Tuesday, model parallelism involves splitting up a network between processors such that different portions of the same layer might be computed on different processors. Knowing that the example network is comprised of two full-connected layers, what changes would you have to make to the code to be able to employ model parallelism. (Note, actually doing this would be an enormous amount of work, but think critically about which parts of the network would need to be rewritten to achieve model parallelism.) \n",
+    "5. As we discussed in class on Tuesday, model parallelism involves splitting up a network between processors such that different portions of the same layer might be computed on different processors. Knowing that the example network is comprised of two fully-connected layers, what changes would you have to make to the code to be able to employ model parallelism? (Note, actually doing this would be an enormous amount of work, but think critically about which parts of the network would need to be rewritten to achieve model parallelism.) \n",
+    "\n",
+    "\n",
+    "6. Pipeline parallelism involves splitting up a network between processors such that each processor is responsible for one or more contiguous operators. How might you change the example to perform pipeline-parallelism? Would this be easier to implement than model parallelism, or harder?\n",
+    "\n",
+    "\n",
+    "7. If a pipeline-parallel network such as the one from the previous question was implemented, how would data quantization help improve performance in a distributed environment?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. My computer is slower when parallelizing the neural network. This is (probably) due to the fact that numpy operations are already parallelized, and there are enough threads that the overhead of switching between them outweighs the performance improvements. I'm sure it would be faster if I limited numpy to one thread.\n",
+    "\n",
+    "2. My Mac runs fastest with 1 thread. Second fastest is 2 threads, and performance decreases from there. This makes sense as my computer only has two cores. As the number of threads increases, the scheduling overhead incurred by the OS increases, slowing down the computation. The other reason for the slowdown was discussed in (1).\n",
+    "\n",
+    "3. When there are more parallel batches, the performance of each batch increases. However, as the minibatch size increases, generalization decreases. This is because large batches \"can increase ... the gradient variance and learning rate\", and this can hinder convergence. Also, SGD becomes closer to GD as the minibatch size increases and this results in the loss of some SGD benefits. In this example, we should choose the parallel_batches parameter to optimize speed of computation and model accuracy.\n",
     "\n",
+    "4. The weights of the model would have to be sent to each distributed computer, as well as the data each processor was working on. The gradients would also need to be sent back to each distributed cluster after the loss-reduce is performed at the end.\n",
     "\n",
-    "5. Pipeline parallelism involves splitting up a network between processors such that each processor is responsible for one or more contiguous operators. How might you change the example to perform pipeline-parallelism? Would this be easier to implement than model parallelism, or harder?\n",
+    "5. Matrix multiplication can be done block-wise. I would spend time parallelizing the computation of each matrix-matrix product that corresponds to a fully-connected layer. Element-wise operators (add, subtract, application of an activation) could trivially be done in parallel.\n",
     "\n",
+    "6. I would do one matmul (corresponding to a fc layer) on one processor, and the next on a second processor. This would probably require a decent amount of mental overhead to keep all the operations ordered, and to merge them after the batch is done. To actually change the example, I'd have to split computeMinibatchGradient into parts. I think this would be a little more difficult than model parallelism.\n",
     "\n",
-    "6. If a pipeline-parallel network such as the one from the previous question was implemented, how would data quantization help improve performance in a distributed environment?"
+    "7. Arithmetic operations would be much faster. There would also be much less data to move around, so weights/gradients could be transferred through the system much faster. The distributed environment could spend less time on data transfer and more time on computation."
    ]
   },
   {
@@ -376,9 +460,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python3 (nn)",
    "language": "python",
-   "name": "python3"
+   "name": "nn"
   },
   "language_info": {
    "codemirror_mode": {
@@ -390,7 +474,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.5"
+   "version": "3.7.3"
   },
   "toc": {
    "base_numbering": 1,