From a6e3800759a1445d0deca5ee5108321a8b2c4291 Mon Sep 17 00:00:00 2001
From: Akerele Abraham <abrahamakerele38@gmail.com>
Date: Wed, 14 May 2025 13:28:18 +0100
Subject: [PATCH 1/5] Add simde

---
 .gitmodules    |  3 +++
 CMakeLists.txt | 29 ++++++++++++++++++++++-------
 simde          |  1 +
 3 files changed, 26 insertions(+), 7 deletions(-)
 create mode 160000 simde
diff --git a/.gitmodules b/.gitmodules
index 83f3148..ed1c1f3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "libjpeg-turbo"]
 	path = libjpeg-turbo
 	url = https://github.com/libjpeg-turbo/libjpeg-turbo.git
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28e6c29..83ec921 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,24 +2,39 @@ cmake_minimum_required(VERSION 2.8.12...3.28)
 
 project(NativeSticher LANGUAGES C)
 
-set(LIBJPEG_TURBO_ROOT "${LIBJPEG_TURBO_ROOT}") 
+set(LIBJPEG_TURBO_ROOT      "${LIBJPEG_TURBO_ROOT}")
 set(LIBJPEG_TURBO_INCLUDE_DIR "${LIBJPEG_TURBO_ROOT}/include")
-set(LIBJPEG_TURBO_LIB_DIR "${LIBJPEG_TURBO_ROOT}/lib")
+set(LIBJPEG_TURBO_LIB_DIR     "${LIBJPEG_TURBO_ROOT}/lib")
 set(LIBJPEG_LIBS turbojpeg)
 
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-    message(FATAL_ERROR "CMAKE_INSTALL_PREFIX not set. Please provide a prefix using -DCMAKE_INSTALL_PREFIX=<path>")
+    message(FATAL_ERROR
+      "CMAKE_INSTALL_PREFIX not set. "
+      "Please provide a prefix using -DCMAKE_INSTALL_PREFIX=<path>")
 endif()
 
 
 include_directories(${LIBJPEG_TURBO_INCLUDE_DIR})
 link_directories(${LIBJPEG_TURBO_LIB_DIR})
 
-add_library(${PROJECT_NAME} SHARED laplace_blending.c jpeg.c utils.c)
+add_library(${PROJECT_NAME} SHARED
+    laplace_blending.c
+    jpeg.c
+    utils.c
+)
+
+target_compile_options(${PROJECT_NAME} PRIVATE -pthread)
+target_link_libraries(${PROJECT_NAME} PRIVATE -pthread)
+
 target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBJPEG_LIBS})
 
+install(TARGETS ${PROJECT_NAME}
+        LIBRARY DESTINATION lib)
 
-install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION lib)
-install(DIRECTORY ${LIBJPEG_TURBO_INCLUDE_DIR}/ DESTINATION include) 
-install(FILES laplace_blending.h utils.h jpeg.h  DESTINATION include)
+install(DIRECTORY ${LIBJPEG_TURBO_INCLUDE_DIR}/
+        DESTINATION include)
 
+install(FILES laplace_blending.h
+              utils.h
+              jpeg.h
+        DESTINATION include)
diff --git a/simde b/simde
new file mode 160000
index 0000000..51743e7
--- /dev/null
+++ b/simde
@@ -0,0 +1 @@
+Subproject commit 51743e7920b6e867678cb50e9c62effe28f70b33

From 94123724be7339e76ea6b1a929bb80df7733ab35 Mon Sep 17 00:00:00 2001
From: Akerele Abraham <abrahamakerele38@gmail.com>
Date: Sat, 17 May 2025 12:21:49 +0100
Subject: [PATCH 2/5] refactor code to add new operation file

---
 CMakeLists.txt          |   12 +-
 README.md               |   26 +-
 examples/downsampling.c |    8 +-
 image_operations.c      |  213 ++++++++
 image_operations.h      |  156 ++++++
 laplace_blending.c      | 1071 +++++++++++++++------------------------
 laplace_blending.h      |  116 +----
 tests/test.c            |    4 +-
 8 files changed, 797 insertions(+), 809 deletions(-)
 create mode 100644 image_operations.c
 create mode 100644 image_operations.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83ec921..a401648 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,17 +14,22 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
 endif()
 
 
-include_directories(${LIBJPEG_TURBO_INCLUDE_DIR})
+include_directories(
+    ${LIBJPEG_TURBO_INCLUDE_DIR}
+     ${CMAKE_SOURCE_DIR}/simde
+)
 link_directories(${LIBJPEG_TURBO_LIB_DIR})
 
 add_library(${PROJECT_NAME} SHARED
+    image_operations.c
     laplace_blending.c
     jpeg.c
     utils.c
 )
 
-target_compile_options(${PROJECT_NAME} PRIVATE -pthread)
+target_compile_options(${PROJECT_NAME} PRIVATE -O3 -pthread)
 target_link_libraries(${PROJECT_NAME} PRIVATE -pthread)
+target_compile_definitions(${PROJECT_NAME} PRIVATE SIMDE_ENABLE_NATIVE_ALIASES)
 
 target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBJPEG_LIBS})
 
@@ -34,7 +39,8 @@ install(TARGETS ${PROJECT_NAME}
 install(DIRECTORY ${LIBJPEG_TURBO_INCLUDE_DIR}/
         DESTINATION include)
 
-install(FILES laplace_blending.h
+install(FILES image_operations.h
+              laplace_blending.h
               utils.h
               jpeg.h
         DESTINATION include)
diff --git a/README.md b/README.md
index e807d98..2da0c3b 100644
--- a/README.md
+++ b/README.md
@@ -1,40 +1,40 @@
 # NativeSticher
-**NativeSticher** is a C library for multiband image merging using the Laplacian blending technique.  
+**NativeSticher** is a C library for multiband image merging using the Laplacian blending technique.
 It enables seamless blending of overlapping images, useful for applications such as panorama generation, image stitching, and exposure fusion. Only supports JPEGS for now.
 
 ---
 
 ## 🚧 Building
-A helper script (`build.sh`) is included to streamline the build process for different platforms.  
-Use this script to build or clean the project easily.  
+A helper script (`build.sh`) is included to streamline the build process for different platforms.
+Use this script to build or clean the project easily.
 
 ### 🔨 Build Instructions:
-- **Build for macOS:**  
+- **Build for macOS:**
   ```bash
   ./build.sh build macos
   ```
-- **Build for ios:**  
+- **Build for ios:**
   ```bash
   ./build.sh build ios
   ```
 
-# 🧪 Testing  
+# 🧪 Testing
 
-To verify the functionality of **NativeSticher**, follow the instructions below based on your setup.  
+To verify the functionality of **NativeSticher**, follow the instructions below based on your setup.
 
 ---
 
-### 1. Testing with libturbojpeg (Direct Compilation)  
-If you have `libturbojpeg` installed, compile and run the test with the following command:  
+### 1. Testing with libturbojpeg (Direct Compilation)
+If you have `libturbojpeg` installed, compile and run the test with the following command:
 ```bash
 gcc-14 -pthread -fsanitize=address -g -o stitch \
 -I../ -I/usr/local/include \
 -L/usr/local/lib -lturbojpeg \
-stitch.c ../laplace_blending.c ../jpeg.c ../utils.c && time ./stitch
+stitch.c ../laplace_blending.c ../jpeg.c ../image_operations.c ../utils.c && time ./stitch
 ```
 
-### 2. Testing with Custom-Built NativeSticher Library  
-If you have manually built the NativeSticher library, use the following command to test it:  
+### 2. Testing with Custom-Built NativeSticher Library
+If you have manually built the NativeSticher library, use the following command to test it:
 ```bash
 gcc-14 -pthread -fsanitize=address -g -o stitch \
 -I../installs/native-stitcher/macos/x86_64/include \
@@ -43,5 +43,3 @@ gcc-14 -pthread -fsanitize=address -g -o stitch \
 -lNativeSticher stitch.c && ./stitch
 ```
 The commands above assume a mac as working machine.
-
-
diff --git a/examples/downsampling.c b/examples/downsampling.c
index da3e733..64d396d 100644
--- a/examples/downsampling.c
+++ b/examples/downsampling.c
@@ -25,7 +25,7 @@ int main()
 
     Image *img = &img_buf1;
     char buf[100];
-    
+
     for (int i = 0; i < 3; i++)
     {
         Image down = downsample(img);
@@ -53,11 +53,11 @@ int main()
         img->width = up.width;
         img->height = up.height;
     }
-    
-    
+
+
 
     destroy_image(&img_buf1);
     destroy_image(&down);
     destroy_image(&up);
     destroy_image(&mask1);
-}
\ No newline at end of file
+}
diff --git a/image_operations.c b/image_operations.c
new file mode 100644
index 0000000..5ed16a3
--- /dev/null
+++ b/image_operations.c
@@ -0,0 +1,213 @@
+
+#include "image_operations.h"
+#include "installs/native-stitcher/macos/x86_64/include/jpeg.h"
+
+Image create_image(const char *filename) { return decompress_jpeg(filename); }
+
+#define DEFINE_CREATE_IMAGE_FUNC(NAME, IMAGE_T, PIXEL_T)                       \
+  IMAGE_T NAME(int width, int height, int channels) {                          \
+    IMAGE_T img;                                                               \
+    img.data = (PIXEL_T *)calloc(width * height * channels, sizeof(PIXEL_T));  \
+    if (!img.data) {                                                           \
+      return img;                                                              \
+    }                                                                          \
+    img.channels = channels;                                                   \
+    img.width = width;                                                         \
+    img.height = height;                                                       \
+    return img;                                                                \
+  }
+
+DEFINE_CREATE_IMAGE_FUNC(create_empty_image, Image, unsigned char)
+DEFINE_CREATE_IMAGE_FUNC(create_empty_image_s, ImageS, short)
+DEFINE_CREATE_IMAGE_FUNC(create_empty_image_f, ImageF, float)
+
+Image create_image_mask(int width, int height, float range, int left,
+                        int right) {
+  return create_mask(width, height, range, left, right);
+}
+
+int save_image(const Image *img, char *out_filename) {
+  if (img->channels == CHANNELS) {
+    return compress_jpeg(out_filename, img, 100);
+  } else {
+    return compress_grayscale_jpeg(out_filename, img, 100);
+  }
+}
+
+void crop_image(Image *img, int cut_top, int cut_bottom, int cut_left,
+                int cut_right) {
+  crop_image_buf(img, cut_top, cut_bottom, cut_left, cut_right, img->channels);
+}
+
+#define DEFINE_DESTROY_IMAGE_FUNC(NAME, PIXEL_T)                               \
+  void NAME(PIXEL_T *img) {                                                    \
+    if (img->data != NULL) {                                                   \
+      free(img->data);                                                         \
+    }                                                                          \
+  }
+
+DEFINE_DESTROY_IMAGE_FUNC(destroy_image, Image)
+DEFINE_DESTROY_IMAGE_FUNC(destroy_image_s, ImageS)
+DEFINE_DESTROY_IMAGE_FUNC(destroy_image_f, ImageF)
+
+#define DEFINE_IMAGE_SIZE_FUNC(NAME, IMAGE_T)                                  \
+  int NAME(IMAGE_T *img) { return img->channels * img->height * img->width; }
+
+DEFINE_IMAGE_SIZE_FUNC(image_size, Image)
+DEFINE_IMAGE_SIZE_FUNC(image_size_s, ImageS)
+DEFINE_IMAGE_SIZE_FUNC(image_size_f, ImageF)
+
+Rect create_rect(int x, int y, int width, int height) {
+  Rect rect;
+  rect.x = x;
+  rect.y = y;
+  rect.width = width;
+  rect.height = height;
+  return rect;
+}
+
+Point br(Rect r) {
+  Point result;
+  result.x = r.x + r.width;
+  result.y = r.y + r.height;
+  return result;
+}
+
+#define DEFINE_DOWNSAMPLE_WORKER_FUNC(NAME, IMAGE_T, PIXEL_T)                  \
+  void *NAME(void *args) {                                                     \
+    ThreadArgs *arg = (ThreadArgs *)args;                                      \
+    int start_row = arg->start_index;                                          \
+    int end_row = arg->end_index;                                              \
+    SamplingThreadData *data =                                                 \
+        (SamplingThreadData *)arg->workerThreadArgs->std;                      \
+    IMAGE_T *img = (IMAGE_T *)data->img;                                       \
+    int imageSize = image_size(data->img);                                     \
+    PIXEL_T *sampled = (PIXEL_T *)data->sampled;                               \
+    for (int y = start_row; y < end_row; ++y) {                                \
+      for (int x = 0; x < data->new_width; ++x) {                              \
+        for (char c = 0; c < img->channels; ++c) {                             \
+          float sum = 0.0;                                                     \
+          for (int i = -2; i < 3; i++) {                                       \
+            for (int j = -2; j < 3; j++) {                                     \
+              int src_row = 2 * y + i;                                         \
+              int src_col = 2 * x + j;                                         \
+              int rr = reflect_index(src_row, img->height);                    \
+              int cc = reflect_index(src_col, img->width);                     \
+              int pos = (cc + rr * img->width) * img->channels + c;            \
+              if (pos < imageSize) {                                           \
+                sum += GAUSSIAN_KERNEL[i + 2][j + 2] * img->data[pos];         \
+              }                                                                \
+            }                                                                  \
+          }                                                                    \
+          if (data->image_type == IMAGE) {                                     \
+            sum = (PIXEL_T)clamp(ceil(sum), 0, 255);                           \
+          }                                                                    \
+          sampled[(y * data->new_width + x) * img->channels + c] = sum;        \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    return NULL;                                                               \
+  }
+
+DEFINE_DOWNSAMPLE_WORKER_FUNC(down_sample_operation, Image, unsigned char)
+DEFINE_DOWNSAMPLE_WORKER_FUNC(down_sample_operation_f, ImageF, float)
+DEFINE_DOWNSAMPLE_WORKER_FUNC(down_sample_operation_s, ImageS, short)
+
+#define DEFINE_DOWNSAMPLE_FUNC(NAME, IMAGE_T, PIXEL_T, IMAGE_T_ENUM)           \
+  IMAGE_T NAME(IMAGE_T *img) {                                                 \
+    IMAGE_T result;                                                            \
+    int new_width = img->width / 2;                                            \
+    int new_height = img->height / 2;                                          \
+    PIXEL_T *downsampled = (PIXEL_T *)malloc(new_width * new_height *          \
+                                             img->channels * sizeof(PIXEL_T)); \
+    if (!downsampled) {                                                        \
+      return result;                                                           \
+    }                                                                          \
+    SamplingThreadData std = {0,   new_width,   new_height,                    \
+                              img, downsampled, IMAGE_T_ENUM};                 \
+    WorkerThreadArgs wtd;                                                      \
+    wtd.std = &std;                                                            \
+    ParallelOperatorArgs args = {new_height, &wtd};                            \
+    parallel_operator(DOWNSAMPLE, &args);                                      \
+    result.channels = img->channels;                                           \
+    result.data = downsampled;                                                 \
+    result.width = new_width;                                                  \
+    result.height = new_height;                                                \
+    return result;                                                             \
+  }
+
+DEFINE_DOWNSAMPLE_FUNC(downsample, Image, unsigned char, IMAGE)
+DEFINE_DOWNSAMPLE_FUNC(downsample_s, ImageS, short, IMAGES)
+DEFINE_DOWNSAMPLE_FUNC(downsample_f, ImageF, float, IMAGEF)
+
+#define DEFINE_UPSAMPLE_WORKER_FUNC(NAME, IMAGE_T, PIXEL_T)                    \
+  void *NAME(void *args) {                                                     \
+    ThreadArgs *arg = (ThreadArgs *)args;                                      \
+    int start_row = arg->start_index;                                          \
+    int end_row = arg->end_index;                                              \
+    SamplingThreadData *s = (SamplingThreadData *)arg->workerThreadArgs->std;  \
+    IMAGE_T *img = (IMAGE_T *)s->img;                                          \
+    PIXEL_T *sampled = (PIXEL_T *)s->sampled;                                  \
+    int pad = 2;                                                               \
+    for (int y = start_row; y < end_row; ++y) {                                \
+      for (int x = 0; x < s->new_width; ++x) {                                 \
+        for (char c = 0; c < img->channels; ++c) {                             \
+          float sum = 0;                                                       \
+          for (int ki = 0; ki < 5; ki++) {                                     \
+            for (int kj = 0; kj < 5; kj++) {                                   \
+              int src_i = reflect_index(y + ki - pad, s->new_height);          \
+              int src_j = reflect_index(x + kj - pad, s->new_width);           \
+              int pixel_val = 0;                                               \
+              if (src_i % 2 == 0 && src_j % 2 == 0) {                          \
+                int orig_i = src_i / 2;                                        \
+                int orig_j = src_j / 2;                                        \
+                int image_pos =                                                \
+                    (orig_i * img->width + orig_j) * img->channels + c;        \
+                pixel_val = img->data[image_pos] * s->upsample_factor;         \
+              }                                                                \
+              sum += GAUSSIAN_KERNEL[ki][kj] * pixel_val;                      \
+            }                                                                  \
+          }                                                                    \
+          int up_image_pos = (y * s->new_width + x) * img->channels + c;       \
+          if (s->image_type == IMAGE) {                                        \
+            sum = (PIXEL_T)clamp(floor(sum + 0.5), 0, 255);                    \
+          }                                                                    \
+          sampled[up_image_pos] = sum;                                         \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    return NULL;                                                               \
+  }
+
+DEFINE_UPSAMPLE_WORKER_FUNC(upsample_worker, Image, unsigned char)
+DEFINE_UPSAMPLE_WORKER_FUNC(upsample_worker_s, ImageS, short)
+DEFINE_UPSAMPLE_WORKER_FUNC(upsample_worker_f, ImageF, float)
+
+#define DEFINE_UPSAMPLE_FUNC(NAME, IMAGE_T, PIXEL_T, IMAGE_T_ENUM)             \
+  IMAGE_T NAME(IMAGE_T *img, float upsample_factor) {                          \
+    IMAGE_T result;                                                            \
+    int new_width = img->width * 2;                                            \
+    int new_height = img->height * 2;                                          \
+    PIXEL_T *upsampled = (PIXEL_T *)calloc(                                    \
+        new_width * new_height * img->channels, sizeof(PIXEL_T));              \
+    if (!upsampled) {                                                          \
+      result.data = NULL;                                                      \
+      result.width = result.height = result.channels = 0;                      \
+      return result;                                                           \
+    }                                                                          \
+    SamplingThreadData std = {upsample_factor, new_width,   new_height, img,   \
+                              upsampled,       IMAGE_T_ENUM};                  \
+    WorkerThreadArgs wtd;                                                      \
+    wtd.std = &std;                                                            \
+    ParallelOperatorArgs args = {new_height, &wtd};                            \
+    parallel_operator(UPSAMPLE, &args);                                        \
+    result.data = upsampled;                                                   \
+    result.width = new_width;                                                  \
+    result.height = new_height;                                                \
+    result.channels = img->channels;                                           \
+    return result;                                                             \
+  }
+
+DEFINE_UPSAMPLE_FUNC(upsample, Image, unsigned char, IMAGE)
+DEFINE_UPSAMPLE_FUNC(upsample_image_s, ImageS, short, IMAGES)
+DEFINE_UPSAMPLE_FUNC(upsample_image_f, ImageF, float, IMAGES)
diff --git a/image_operations.h b/image_operations.h
new file mode 100644
index 0000000..d875d65
--- /dev/null
+++ b/image_operations.h
@@ -0,0 +1,156 @@
+#include "turbojpeg.h"
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#include "jpeg.h"
+#include "utils.h"
+
+static const float GAUSSIAN_KERNEL[5][5] = {
+    {1.0 / 256, 4.0 / 256, 6.0 / 256, 4.0 / 256, 1.0 / 256},
+    {4.0 / 256, 16.0 / 256, 24.0 / 256, 16.0 / 256, 4.0 / 256},
+    {6.0 / 256, 24.0 / 256, 36.0 / 256, 24.0 / 256, 6.0 / 256},
+    {4.0 / 256, 16.0 / 256, 24.0 / 256, 16.0 / 256, 4.0 / 256},
+    {1.0 / 256, 4.0 / 256, 6.0 / 256, 4.0 / 256, 1.0 / 256}};
+
+static const float WEIGHT_EPS = 1e-5f;
+
+
+
+#define MAX_BANDS 7
+typedef enum
+{
+    DOWNSAMPLE,
+    UPSAMPLE,
+    LAPLACIAN,
+    FEED,
+    BLEND,
+    NORMALIZE
+} OperatorType;
+
+typedef struct
+{
+    int x;
+    int y;
+} Point;
+
+typedef struct
+{
+    int x;
+    int y;
+    int width;
+    int height;
+} Rect;
+
+Point br(Rect r);
+
+typedef struct
+{
+    float upsample_factor;
+    int new_width;
+    int new_height;
+    void *img;
+    void *sampled;
+    ImageType image_type;
+} SamplingThreadData;
+
+typedef struct
+{
+    ImageS *original;
+    ImageS *upsampled;
+    int total_size;
+} LaplacianThreadData;
+
+typedef struct
+{
+    int rows;
+    int cols;
+    int x_tl;
+    int y_tl;
+    int out_level_width;
+    int out_level_height;
+    int level_width;
+    int level_height;
+    int level;
+    ImageS *img_laplacians;
+    ImageS *mask_gaussian;
+    ImageF *out;
+    ImageF *out_mask;
+} FeedThreadData;
+
+typedef struct
+{
+    int output_width;
+    int level;
+    ImageF *out;
+    ImageF *out_mask;
+    ImageS *final_out;
+} NormalThreadData;
+
+typedef struct
+{
+    int out_size;
+    ImageS blended_image;
+    ImageS out_level;
+} BlendThreadData;
+
+typedef union
+{
+    SamplingThreadData *std;
+    LaplacianThreadData *ltd;
+    FeedThreadData *ftd;
+    BlendThreadData *btd;
+    NormalThreadData *ntd;
+} WorkerThreadArgs;
+
+typedef struct
+{
+    int rows;
+    WorkerThreadArgs *workerThreadArgs;
+} ParallelOperatorArgs;
+
+typedef struct
+{
+    int start_index;
+    int end_index;
+    WorkerThreadArgs *workerThreadArgs;
+} ThreadArgs;
+
+
+Image create_image(const char *filename);
+
+Image create_empty_image(int width, int height, int channels);
+ImageS create_empty_image_s(int width, int height, int channels);
+ImageF create_empty_image_f(int width, int height, int channels);
+
+Image create_image_mask(int width, int height, float range, int left, int right);
+int save_image(const Image *img, char *out_filename);
+
+int image_size(Image *img);
+int image_size_s(ImageS *img);
+int image_size_f(ImageF *img);
+
+void destroy_image(Image *img);
+void destroy_image_s(ImageS *img);
+void destroy_image_f(ImageF *img);
+
+Image upsample( Image *img,float upsample_factor);
+ImageS upsample_image_s( ImageS *img,float upsample_factor);
+ImageF upsample_image_f( ImageF *img,float upsample_factor);
+
+void *down_sample_operation(void *args);
+void *down_sample_operation_s(void *args);
+void *down_sample_operation_f(void *args);
+
+void *upsample_worker(void *args);
+void *upsample_worker_s(void *args);
+void *upsample_worker_f(void *args);
+
+Image downsample(Image *img);
+ImageS downsample_s(ImageS *img);
+ImageF downsample_f(ImageF *img);
+
+void crop_image(Image *img, int cut_top, int cut_bottom, int cut_left, int cut_right);
+void parallel_operator(OperatorType operatorType, ParallelOperatorArgs *arg);
diff --git a/laplace_blending.c b/laplace_blending.c
index 530473b..da4bf1b 100644
--- a/laplace_blending.c
+++ b/laplace_blending.c
@@ -1,761 +1,490 @@
 #include "turbojpeg.h"
-#include <stdlib.h>
+#include <math.h>
 #include <pthread.h>
 #include <stdio.h>
-#include <math.h>
-#include <time.h>
+#include <stdlib.h>
 #include <string.h>
+#include <time.h>
 
-#include "utils.h"
 #include "laplace_blending.h"
+#include "utils.h"
 
-static const float GAUSSIAN_KERNEL[5][5] = {
-    {1.0 / 256, 4.0 / 256, 6.0 / 256, 4.0 / 256, 1.0 / 256},
-    {4.0 / 256, 16.0 / 256, 24.0 / 256, 16.0 / 256, 4.0 / 256},
-    {6.0 / 256, 24.0 / 256, 36.0 / 256, 24.0 / 256, 6.0 / 256},
-    {4.0 / 256, 16.0 / 256, 24.0 / 256, 16.0 / 256, 4.0 / 256},
-    {1.0 / 256, 4.0 / 256, 6.0 / 256, 4.0 / 256, 1.0 / 256}};
-
-static const float WEIGHT_EPS = 1e-5f;
-
-Image create_image(const char *filename)
-{
-    return decompress_jpeg(filename);
-}
-
-#define DEFINE_CREATE_IMAGE_FUNC(NAME, IMAGE_T, PIXEL_T)                          \
-    IMAGE_T NAME(int width, int height, int channels)                             \
-    {                                                                             \
-        IMAGE_T img;                                                              \
-        img.data = (PIXEL_T *)calloc(width * height * channels, sizeof(PIXEL_T)); \
-        if (!img.data)                                                            \
-        {                                                                         \
-            return img;                                                           \
-        }                                                                         \
-        img.channels = channels;                                                  \
-        img.width = width;                                                        \
-        img.height = height;                                                      \
-        return img;                                                               \
-    }
-
-DEFINE_CREATE_IMAGE_FUNC(create_empty_image, Image, unsigned char)
-DEFINE_CREATE_IMAGE_FUNC(create_empty_image_s, ImageS, short)
-DEFINE_CREATE_IMAGE_FUNC(create_empty_image_f, ImageF, float)
-
-Image create_image_mask(int width, int height, float range, int left, int right)
-{
-    return create_mask(width, height, range, left, right);
-}
-
-int save_image(const Image *img, char *out_filename)
-{
-    if (img->channels == CHANNELS)
-    {
-        return compress_jpeg(out_filename, img, 100);
-    }
-    else
-    {
-        return compress_grayscale_jpeg(out_filename, img, 100);
-    }
-}
-
-void crop_image(Image *img, int cut_top, int cut_bottom, int cut_left, int cut_right)
-{
-    crop_image_buf(img, cut_top, cut_bottom, cut_left, cut_right, img->channels);
-}
-
-#define DEFINE_DESTROY_IMAGE_FUNC(NAME, PIXEL_T) \
-    void NAME(PIXEL_T *img)                      \
-    {                                            \
-        if (img->data != NULL)                   \
-        {                                        \
-            free(img->data);                     \
-        }                                        \
-    }
-
-DEFINE_DESTROY_IMAGE_FUNC(destroy_image, Image)
-DEFINE_DESTROY_IMAGE_FUNC(destroy_image_s, ImageS)
-DEFINE_DESTROY_IMAGE_FUNC(destroy_image_f, ImageF)
-
-#define DEFINE_IMAGE_SIZE_FUNC(NAME, IMAGE_T)            \
-    int NAME(IMAGE_T *img)                               \
-    {                                                    \
-        return img->channels * img->height * img->width; \
-    }
-
-DEFINE_IMAGE_SIZE_FUNC(image_size, Image)
-DEFINE_IMAGE_SIZE_FUNC(image_size_s, ImageS)
-DEFINE_IMAGE_SIZE_FUNC(image_size_f, ImageF)
-
-Rect create_rect(int x, int y, int width, int height)
-{
-    Rect rect;
-    rect.x = x;
-    rect.y = y;
-    rect.width = width;
-    rect.height = height;
-    return rect;
-}
+Blender *create_blender(Rect out_size, int nb) {
 
-Point br(Rect r)
-{
-    Point result;
-    result.x = r.x + r.width;
-    result.y = r.y + r.height;
-    return result;
-}
+  Blender *blender = (Blender *)malloc(sizeof(Blender));
+  blender->real_out_size = out_size;
+  if (!blender)
+    return NULL;
 
-Blender *create_blender(Rect out_size, int nb)
-{
-
-    Blender *blender = (Blender *)malloc(sizeof(Blender));
-    blender->real_out_size = out_size;
-    if (!blender)
-        return NULL;
-
-    blender->num_bands = min(MAX_BANDS, nb);
-
-    double max_len = (double)(out_size.width > out_size.height ? out_size.width : out_size.height);
-    blender->num_bands = min(blender->num_bands, (int)ceil(log(max_len) / log(2.0)));
-
-    out_size.width += ((1 << blender->num_bands) - out_size.width % (1 << blender->num_bands)) % (1 << blender->num_bands);
-    out_size.height += ((1 << blender->num_bands) - out_size.height % (1 << blender->num_bands)) % (1 << blender->num_bands);
-    blender->output_size = out_size;
-
-    blender->out = (ImageF *)malloc((blender->num_bands + 1) * sizeof(ImageF));
-    blender->final_out = (ImageS *)malloc((blender->num_bands + 1) * sizeof(ImageS));
-    blender->out_mask = (ImageF *)malloc((blender->num_bands + 1) * sizeof(ImageF));
-    blender->out_width_levels = (int *)malloc((blender->num_bands + 1) * sizeof(int));
-    blender->out_height_levels = (int *)malloc((blender->num_bands + 1) * sizeof(int));
-    blender->img_laplacians = (ImageS *)malloc((blender->num_bands + 1) * sizeof(Image));
-    blender->mask_gaussian = (ImageS *)malloc((blender->num_bands + 1) * sizeof(Image));
-
-    if (!blender->out ||
-        !blender->final_out ||
-        !blender->out_mask ||
-        !blender->out_width_levels ||
-        !blender->out_height_levels ||
-        !blender->img_laplacians ||
-        !blender->mask_gaussian)
-    {
-        free(blender->out);
-        free(blender->final_out);
-        free(blender->out_mask);
-        free(blender->out_width_levels);
-        free(blender->out_height_levels);
-        free(blender->img_laplacians);
-        free(blender->mask_gaussian);
-        free(blender);
-        return NULL;
-    }
+  blender->num_bands = min(MAX_BANDS, nb);
+
+  double max_len = (double)(out_size.width > out_size.height ? out_size.width
+                                                             : out_size.height);
+  blender->num_bands =
+      min(blender->num_bands, (int)ceil(log(max_len) / log(2.0)));
+
+  out_size.width +=
+      ((1 << blender->num_bands) - out_size.width % (1 << blender->num_bands)) %
+      (1 << blender->num_bands);
+  out_size.height += ((1 << blender->num_bands) -
+                      out_size.height % (1 << blender->num_bands)) %
+                     (1 << blender->num_bands);
+  blender->output_size = out_size;
+
+  blender->out = (ImageF *)malloc((blender->num_bands + 1) * sizeof(ImageF));
+  blender->final_out =
+      (ImageS *)malloc((blender->num_bands + 1) * sizeof(ImageS));
+  blender->out_mask =
+      (ImageF *)malloc((blender->num_bands + 1) * sizeof(ImageF));
+  blender->out_width_levels =
+      (int *)malloc((blender->num_bands + 1) * sizeof(int));
+  blender->out_height_levels =
+      (int *)malloc((blender->num_bands + 1) * sizeof(int));
+  blender->img_laplacians =
+      (ImageS *)malloc((blender->num_bands + 1) * sizeof(Image));
+  blender->mask_gaussian =
+      (ImageS *)malloc((blender->num_bands + 1) * sizeof(Image));
+
+  if (!blender->out || !blender->final_out || !blender->out_mask ||
+      !blender->out_width_levels || !blender->out_height_levels ||
+      !blender->img_laplacians || !blender->mask_gaussian) {
+    free(blender->out);
+    free(blender->final_out);
+    free(blender->out_mask);
+    free(blender->out_width_levels);
+    free(blender->out_height_levels);
+    free(blender->img_laplacians);
+    free(blender->mask_gaussian);
+    free(blender);
+    return NULL;
+  }
 
-    blender->out[0] = create_empty_image_f(out_size.width, out_size.height, 3);
-    blender->out_mask[0] = create_empty_image_f(out_size.width, out_size.height, 1);
+  blender->out[0] = create_empty_image_f(out_size.width, out_size.height, 3);
+  blender->out_mask[0] =
+      create_empty_image_f(out_size.width, out_size.height, 1);
 
-    blender->out_width_levels[0] = out_size.width;
-    blender->out_height_levels[0] = out_size.height;
+  blender->out_width_levels[0] = out_size.width;
+  blender->out_height_levels[0] = out_size.height;
 
-    for (int i = 1; i <= blender->num_bands; i++)
-    {
-        blender->out_width_levels[i] = (blender->out_width_levels[i - 1] + 1) / 2;
-        blender->out_height_levels[i] = (blender->out_height_levels[i - 1] + 1) / 2;
+  for (int i = 1; i <= blender->num_bands; i++) {
+    blender->out_width_levels[i] = (blender->out_width_levels[i - 1] + 1) / 2;
+    blender->out_height_levels[i] = (blender->out_height_levels[i - 1] + 1) / 2;
 
-        blender->out[i] = create_empty_image_f(blender->out_width_levels[i], blender->out_height_levels[i], 3);
-        blender->out_mask[i] = create_empty_image_f(blender->out_width_levels[i], blender->out_height_levels[i], 1);
-    }
+    blender->out[i] = create_empty_image_f(blender->out_width_levels[i],
+                                           blender->out_height_levels[i], 3);
+    blender->out_mask[i] = create_empty_image_f(
+        blender->out_width_levels[i], blender->out_height_levels[i], 1);
+  }
 
-    return blender;
+  return blender;
 }
 
-void destroy_blender(Blender *blender)
-{
-    if (!blender)
-        return;
+void destroy_blender(Blender *blender) {
+  if (!blender)
+    return;
 
-    if (blender->out != NULL)
-    {
-        free(blender->out);
-    }
+  if (blender->out != NULL) {
+    free(blender->out);
+  }
 
-    if (blender->out_mask != NULL)
-    {
-        free(blender->out_mask);
-    }
+  if (blender->out_mask != NULL) {
+    free(blender->out_mask);
+  }
 
-    free(blender->out_width_levels);
-    free(blender->out_height_levels);
+  free(blender->out_width_levels);
+  free(blender->out_height_levels);
 
-    destroy_image(&blender->result);
-    destroy_image_s(blender->final_out);
+  destroy_image(&blender->result);
+  destroy_image_s(blender->final_out);
 
-    free(blender->img_laplacians);
-    free(blender->mask_gaussian);
+  free(blender->img_laplacians);
+  free(blender->mask_gaussian);
 
-    free(blender);
+  free(blender);
 }
 
-#define DEFINE_DOWNSAMPLE_WORKER_FUNC(NAME, IMAGE_T, PIXEL_T)                          \
-    void *NAME(void *args)                                                             \
-    {                                                                                  \
-        ThreadArgs *arg = (ThreadArgs *)args;                                          \
-        int start_row = arg->start_index;                                              \
-        int end_row = arg->end_index;                                                  \
-        SamplingThreadData *data = (SamplingThreadData *)arg->workerThreadArgs->std;   \
-        IMAGE_T *img = (IMAGE_T *)data->img;                                           \
-        int imageSize = image_size(data->img);                                         \
-        PIXEL_T *sampled = (PIXEL_T *)data->sampled;                                   \
-        for (int y = start_row; y < end_row; ++y)                                      \
-        {                                                                              \
-            for (int x = 0; x < data->new_width; ++x)                                  \
-            {                                                                          \
-                for (char c = 0; c < img->channels; ++c)                               \
-                {                                                                      \
-                    float sum = 0.0;                                                   \
-                    for (int i = -2; i < 3; i++)                                       \
-                    {                                                                  \
-                        for (int j = -2; j < 3; j++)                                   \
-                        {                                                              \
-                            int src_row = 2 * y + i;                                   \
-                            int src_col = 2 * x + j;                                   \
-                            int rr = reflect_index(src_row, img->height);              \
-                            int cc = reflect_index(src_col, img->width);               \
-                            int pos = (cc + rr * img->width) * img->channels + c;      \
-                            if (pos < imageSize)                                       \
-                            {                                                          \
-                                sum += GAUSSIAN_KERNEL[i + 2][j + 2] * img->data[pos]; \
-                            }                                                          \
-                        }                                                              \
-                    }                                                                  \
-                    if (data->image_type == IMAGE)                                     \
-                    {                                                                  \
-                        sum = (PIXEL_T)clamp(ceil(sum), 0, 255);                       \
-                    }                                                                  \
-                    sampled[(y * data->new_width + x) * img->channels + c] = sum;      \
-                }                                                                      \
-            }                                                                          \
-        }                                                                              \
-        return NULL;                                                                   \
-    }
-
-DEFINE_DOWNSAMPLE_WORKER_FUNC(down_sample_operation, Image, unsigned char)
-DEFINE_DOWNSAMPLE_WORKER_FUNC(down_sample_operation_f, ImageF, float)
-DEFINE_DOWNSAMPLE_WORKER_FUNC(down_sample_operation_s, ImageS, short)
-
-#define DEFINE_DOWNSAMPLE_FUNC(NAME, IMAGE_T, PIXEL_T, IMAGE_T_ENUM)                                        \
-    IMAGE_T NAME(IMAGE_T *img)                                                                              \
-    {                                                                                                       \
-        IMAGE_T result;                                                                                     \
-        int new_width = img->width / 2;                                                                     \
-        int new_height = img->height / 2;                                                                   \
-        PIXEL_T *downsampled = (PIXEL_T *)malloc(new_width * new_height * img->channels * sizeof(PIXEL_T)); \
-        if (!downsampled)                                                                                   \
-        {                                                                                                   \
-            return result;                                                                                  \
-        }                                                                                                   \
-        SamplingThreadData std = {0, new_width, new_height, img, downsampled, IMAGE_T_ENUM};                \
-        WorkerThreadArgs wtd;                                                                               \
-        wtd.std = &std;                                                                                     \
-        ParallelOperatorArgs args = {new_height, &wtd};                                                     \
-        parallel_operator(DOWNSAMPLE, &args);                                                               \
-        result.channels = img->channels;                                                                    \
-        result.data = downsampled;                                                                          \
-        result.width = new_width;                                                                           \
-        result.height = new_height;                                                                         \
-        return result;                                                                                      \
-    }
-
-DEFINE_DOWNSAMPLE_FUNC(downsample, Image, unsigned char, IMAGE)
-DEFINE_DOWNSAMPLE_FUNC(downsample_s, ImageS, short, IMAGES)
-DEFINE_DOWNSAMPLE_FUNC(downsample_f, ImageF, float, IMAGEF)
-
-#define DEFINE_UPSAMPLE_WORKER_FUNC(NAME, IMAGE_T, PIXEL_T)                                         \
-    void *NAME(void *args)                                                                          \
-    {                                                                                               \
-        ThreadArgs *arg = (ThreadArgs *)args;                                                       \
-        int start_row = arg->start_index;                                                           \
-        int end_row = arg->end_index;                                                               \
-        SamplingThreadData *s = (SamplingThreadData *)arg->workerThreadArgs->std;                   \
-        IMAGE_T *img = (IMAGE_T *)s->img;                                                           \
-        PIXEL_T *sampled = (PIXEL_T *)s->sampled;                                                   \
-        int pad = 2;                                                                                \
-        for (int y = start_row; y < end_row; ++y)                                                   \
-        {                                                                                           \
-            for (int x = 0; x < s->new_width; ++x)                                                  \
-            {                                                                                       \
-                for (char c = 0; c < img->channels; ++c)                                            \
-                {                                                                                   \
-                    float sum = 0;                                                                  \
-                    for (int ki = 0; ki < 5; ki++)                                                  \
-                    {                                                                               \
-                        for (int kj = 0; kj < 5; kj++)                                              \
-                        {                                                                           \
-                            int src_i = reflect_index(y + ki - pad, s->new_height);                 \
-                            int src_j = reflect_index(x + kj - pad, s->new_width);                  \
-                            int pixel_val = 0;                                                      \
-                            if (src_i % 2 == 0 && src_j % 2 == 0)                                   \
-                            {                                                                       \
-                                int orig_i = src_i / 2;                                             \
-                                int orig_j = src_j / 2;                                             \
-                                int image_pos = (orig_i * img->width + orig_j) * img->channels + c; \
-                                pixel_val = img->data[image_pos] * s->upsample_factor;              \
-                            }                                                                       \
-                            sum += GAUSSIAN_KERNEL[ki][kj] * pixel_val;                             \
-                        }                                                                           \
-                    }                                                                               \
-                    int up_image_pos = (y * s->new_width + x) * img->channels + c;                  \
-                    if (s->image_type == IMAGE)                                                     \
-                    {                                                                               \
-                        sum = (PIXEL_T)clamp(floor(sum + 0.5), 0, 255);                             \
-                    }                                                                               \
-                    sampled[up_image_pos] = sum;                                                    \
-                }                                                                                   \
-            }                                                                                       \
-        }                                                                                           \
-        return NULL;                                                                                \
-    }
-
-DEFINE_UPSAMPLE_WORKER_FUNC(upsample_worker, Image, unsigned char)
-DEFINE_UPSAMPLE_WORKER_FUNC(upsample_worker_s, ImageS, short)
-DEFINE_UPSAMPLE_WORKER_FUNC(upsample_worker_f, ImageF, float)
-
-#define DEFINE_UPSAMPLE_FUNC(NAME, IMAGE_T, PIXEL_T, IMAGE_T_ENUM)                                       \
-    IMAGE_T NAME(IMAGE_T *img, float upsample_factor)                                                    \
-    {                                                                                                    \
-        IMAGE_T result;                                                                                  \
-        int new_width = img->width * 2;                                                                  \
-        int new_height = img->height * 2;                                                                \
-        PIXEL_T *upsampled = (PIXEL_T *)calloc(new_width * new_height * img->channels, sizeof(PIXEL_T)); \
-        if (!upsampled)                                                                                  \
-        {                                                                                                \
-            result.data = NULL;                                                                          \
-            result.width = result.height = result.channels = 0;                                          \
-            return result;                                                                               \
-        }                                                                                                \
-        SamplingThreadData std = {upsample_factor, new_width, new_height, img, upsampled, IMAGE_T_ENUM}; \
-        WorkerThreadArgs wtd;                                                                            \
-        wtd.std = &std;                                                                                  \
-        ParallelOperatorArgs args = {new_height, &wtd};                                                  \
-        parallel_operator(UPSAMPLE, &args);                                                              \
-        result.data = upsampled;                                                                         \
-        result.width = new_width;                                                                        \
-        result.height = new_height;                                                                      \
-        result.channels = img->channels;                                                                 \
-        return result;                                                                                   \
-    }
-
-DEFINE_UPSAMPLE_FUNC(upsample, Image, unsigned char, IMAGE)
-DEFINE_UPSAMPLE_FUNC(upsample_image_s, ImageS, short, IMAGES)
-DEFINE_UPSAMPLE_FUNC(upsample_image_f, ImageF, float, IMAGES)
+void *compute_laplacian_worker(void *args) {
+  ThreadArgs *arg = (ThreadArgs *)args;
+  int start_row = arg->start_index;
+  int end_row = arg->end_index;
+  LaplacianThreadData *l = (LaplacianThreadData *)arg->workerThreadArgs->ltd;
 
-void *compute_laplacian_worker(void *args)
-{
-    ThreadArgs *arg = (ThreadArgs *)args;
-    int start_row = arg->start_index;
-    int end_row = arg->end_index;
-    LaplacianThreadData *l = (LaplacianThreadData *)arg->workerThreadArgs->ltd;
+  for (int i = start_row; i < end_row; ++i) {
+    l->upsampled->data[i] = l->original->data[i] - l->upsampled->data[i];
+  }
 
-    for (int i = start_row; i < end_row; ++i)
-    {
-        l->upsampled->data[i] = l->original->data[i] - l->upsampled->data[i];
-    }
-
-    return NULL;
+  return NULL;
 }
 
-void compute_laplacian(ImageS *original, ImageS *upsampled)
-{
-    int total_size = original->width * original->height * original->channels;
+void compute_laplacian(ImageS *original, ImageS *upsampled) {
+  int total_size = original->width * original->height * original->channels;
 
-    LaplacianThreadData ltd = {original, upsampled, total_size};
-    WorkerThreadArgs wtd;
-    wtd.ltd = &ltd;
-    ParallelOperatorArgs args = {total_size, &wtd};
+  LaplacianThreadData ltd = {original, upsampled, total_size};
+  WorkerThreadArgs wtd;
+  wtd.ltd = &ltd;
+  ParallelOperatorArgs args = {total_size, &wtd};
 
-    parallel_operator(LAPLACIAN, &args);
+  parallel_operator(LAPLACIAN, &args);
 }
 
-void *feed_worker(void *args)
-{
-    ThreadArgs *arg = (ThreadArgs *)args;
-    int start_row = arg->start_index;
-    int end_row = arg->end_index;
-    FeedThreadData *f = (FeedThreadData *)arg->workerThreadArgs->ftd;
+void *feed_worker(void *args) {
+  ThreadArgs *arg = (ThreadArgs *)args;
+  int start_row = arg->start_index;
+  int end_row = arg->end_index;
+  FeedThreadData *f = (FeedThreadData *)arg->workerThreadArgs->ftd;
 
-    for (int k = start_row; k < end_row; ++k)
-    {
-        for (int i = 0; i < f->cols; ++i)
-        {
-            int maskIndex = i + (k * f->level_width);
-            int outMaskLevelIndex = ((i + f->x_tl) + ((k + f->y_tl) * f->out_level_width));
+  for (int k = start_row; k < end_row; ++k) {
+    for (int i = 0; i < f->cols; ++i) {
+      int maskIndex = i + (k * f->level_width);
+      int outMaskLevelIndex =
+          ((i + f->x_tl) + ((k + f->y_tl) * f->out_level_width));
 
-            for (char z = 0; z < CHANNELS; ++z)
-            {
-                int imgIndex = ((i + (k * f->level_width)) * CHANNELS) + z;
+      for (char z = 0; z < CHANNELS; ++z) {
+        int imgIndex = ((i + (k * f->level_width)) * CHANNELS) + z;
 
-                if (imgIndex < f->img_laplacians[f->level].width * f->img_laplacians[f->level].height * CHANNELS &&
-                    maskIndex < f->mask_gaussian[f->level].width * f->mask_gaussian[f->level].height)
-                {
+        if (imgIndex < f->img_laplacians[f->level].width *
+                           f->img_laplacians[f->level].height * CHANNELS &&
+            maskIndex < f->mask_gaussian[f->level].width *
+                            f->mask_gaussian[f->level].height) {
 
-                    int outLevelIndex = ((i + f->x_tl) + (k + f->y_tl) * f->out_level_width) * CHANNELS + z;
+          int outLevelIndex =
+              ((i + f->x_tl) + (k + f->y_tl) * f->out_level_width) * CHANNELS +
+              z;
 
-                    float maskVal = f->mask_gaussian[f->level].data[maskIndex];
-                    float imgVal = f->img_laplacians[f->level].data[imgIndex];
+          float maskVal = f->mask_gaussian[f->level].data[maskIndex];
+          float imgVal = f->img_laplacians[f->level].data[imgIndex];
 
-                    maskVal = maskVal * (1.0 / 255.0);
+          maskVal = maskVal * (1.0 / 255.0);
 
-                    imgVal = imgVal * maskVal;
+          imgVal = imgVal * maskVal;
 
-                    if (outLevelIndex < f->out_level_height * f->out_level_width * CHANNELS &&
-                        outMaskLevelIndex < f->out_level_height * f->out_level_width)
-                    {
-                        f->out[f->level].data[outLevelIndex] += imgVal;
+          if (outLevelIndex <
+                  f->out_level_height * f->out_level_width * CHANNELS &&
+              outMaskLevelIndex < f->out_level_height * f->out_level_width) {
+            f->out[f->level].data[outLevelIndex] += imgVal;
 
-                        if (z == 0)
-                        {
-                            f->out_mask[f->level].data[outMaskLevelIndex] += maskVal;
-                        }
-                    }
-                }
+            if (z == 0) {
+              f->out_mask[f->level].data[outMaskLevelIndex] += maskVal;
             }
+          }
         }
+      }
     }
+  }
 
-    return NULL;
+  return NULL;
 }
 
-int feed(Blender *b, Image *img, Image *mask_img, Point tl)
-{
-    ImageS images[b->num_bands + 1];
-    int return_val = 1;
-
-    int gap = 3 * (1 << b->num_bands);
-    Point tl_new, br_new;
+int feed(Blender *b, Image *img, Image *mask_img, Point tl) {
+  ImageS images[b->num_bands + 1];
+  int return_val = 1;
 
-    tl_new.x = max(b->output_size.x, tl.x - gap);
-    tl_new.y = max(b->output_size.y, tl.y - gap);
+  int gap = 3 * (1 << b->num_bands);
+  Point tl_new, br_new;
 
-    Point br_point = br(b->output_size);
-    br_new.x = min(br_point.x, tl.x + img->width + gap);
-    br_new.y = min(br_point.y, tl.y + img->height + gap);
+  tl_new.x = max(b->output_size.x, tl.x - gap);
+  tl_new.y = max(b->output_size.y, tl.y - gap);
 
-    tl_new.x = b->output_size.x + (((tl_new.x - b->output_size.x) >> b->num_bands) << b->num_bands);
-    tl_new.y = b->output_size.y + (((tl_new.y - b->output_size.y) >> b->num_bands) << b->num_bands);
+  Point br_point = br(b->output_size);
+  br_new.x = min(br_point.x, tl.x + img->width + gap);
+  br_new.y = min(br_point.y, tl.y + img->height + gap);
 
-    int width = br_new.x - tl_new.x;
-    int height = br_new.y - tl_new.y;
+  tl_new.x = b->output_size.x +
+             (((tl_new.x - b->output_size.x) >> b->num_bands) << b->num_bands);
+  tl_new.y = b->output_size.y +
+             (((tl_new.y - b->output_size.y) >> b->num_bands) << b->num_bands);
 
-    width += ((1 << b->num_bands) - width % (1 << b->num_bands)) % (1 << b->num_bands);
-    height += ((1 << b->num_bands) - height % (1 << b->num_bands)) % (1 << b->num_bands);
+  int width = br_new.x - tl_new.x;
+  int height = br_new.y - tl_new.y;
 
-    br_new.x = tl_new.x + width;
-    br_new.y = tl_new.y + height;
+  width +=
+      ((1 << b->num_bands) - width % (1 << b->num_bands)) % (1 << b->num_bands);
+  height += ((1 << b->num_bands) - height % (1 << b->num_bands)) %
+            (1 << b->num_bands);
 
-    int dx = max(br_new.x - br_point.x, 0);
-    int dy = max(br_new.y - br_point.y, 0);
+  br_new.x = tl_new.x + width;
+  br_new.y = tl_new.y + height;
 
-    tl_new.x -= dx;
-    br_new.x -= dx;
-    tl_new.y -= dy;
-    br_new.y -= dy;
+  int dx = max(br_new.x - br_point.x, 0);
+  int dy = max(br_new.y - br_point.y, 0);
 
-    int top = tl.y - tl_new.y;
-    int left = tl.x - tl_new.x;
-    int bottom = br_new.y - tl.y - img->height;
-    int right = br_new.x - tl.x - img->width;
+  tl_new.x -= dx;
+  br_new.x -= dx;
+  tl_new.y -= dy;
+  br_new.y -= dy;
 
-    add_border_to_image(img, top, bottom, left, right, CHANNELS, BORDER_REFLECT);
-    add_border_to_image(mask_img, top, bottom, left, right, 1, BORDER_CONSTANT);
+  int top = tl.y - tl_new.y;
+  int left = tl.x - tl_new.x;
+  int bottom = br_new.y - tl.y - img->height;
+  int right = br_new.x - tl.x - img->width;
 
-    images[0] = create_empty_image_s(img->width, img->height, img->channels);
-    convert_image_to_image_s(img, &images[0]);
-    for (int j = 0; j < b->num_bands; ++j)
-    {
-        images[j + 1] = downsample_s(&images[j]);
-        if (!images[j + 1].data)
-        {
-            return_val = 0;
-            goto clean;
-        }
-
-        b->img_laplacians[j] = upsample_image_s(&images[j + 1], 4.f);
-        if (!&b->img_laplacians[j])
-        {
-            return_val = 0;
-            goto clean;
-        }
+  add_border_to_image(img, top, bottom, left, right, CHANNELS, BORDER_REFLECT);
+  add_border_to_image(mask_img, top, bottom, left, right, 1, BORDER_CONSTANT);
 
-        compute_laplacian(&images[j], &b->img_laplacians[j]);
+  images[0] = create_empty_image_s(img->width, img->height, img->channels);
+  convert_image_to_image_s(img, &images[0]);
+  for (int j = 0; j < b->num_bands; ++j) {
+    images[j + 1] = downsample_s(&images[j]);
+    if (!images[j + 1].data) {
+      return_val = 0;
+      goto clean;
     }
 
-    b->img_laplacians[b->num_bands] = images[b->num_bands];
-    ImageS sampled;
-    ImageS mask_img_ = create_empty_image_s(mask_img->width, mask_img->height, mask_img->channels);
-    convert_image_to_image_s(mask_img, &mask_img_);
-    for (int j = 0; j < b->num_bands; ++j)
-    {
-        b->mask_gaussian[j] = mask_img_;
-        sampled = downsample_s(&mask_img_);
-        if (!sampled.data)
-        {
-            return_val = 0;
-            goto clean;
-        }
-        mask_img_ = sampled;
+    b->img_laplacians[j] = upsample_image_s(&images[j + 1], 4.f);
+    if (!&b->img_laplacians[j]) {
+      return_val = 0;
+      goto clean;
     }
 
-    b->mask_gaussian[b->num_bands] = mask_img_;
-
-    int y_tl = tl_new.y - b->output_size.y;
-    int y_br = br_new.y - b->output_size.y;
-    int x_tl = tl_new.x - b->output_size.x;
-    int x_br = br_new.x - b->output_size.x;
-
-    for (int level = 0; level <= b->num_bands; ++level)
-    {
-
-        int rows = (y_br - y_tl);
-        int cols = (x_br - x_tl);
-
-        FeedThreadData ftd;
-        ftd.rows = rows;
-        ftd.cols = cols;
-        ftd.x_tl = x_tl;
-        ftd.y_tl = y_tl;
-        ftd.out_level_width = b->out_width_levels[level];
-        ftd.out_level_height = b->out_height_levels[level];
-        ftd.level_width = b->img_laplacians[level].width;
-        ftd.level_height = b->img_laplacians[level].height;
-        ftd.level = level;
-        ftd.img_laplacians = b->img_laplacians;
-        ftd.mask_gaussian = b->mask_gaussian;
-        ftd.out = b->out;
-        ftd.out_mask = b->out_mask;
-
-        WorkerThreadArgs wtd;
-        wtd.ftd = &ftd;
-        ParallelOperatorArgs args = {rows, &wtd};
-
-        parallel_operator(FEED, &args);
-
-        x_tl /= 2;
-        y_tl /= 2;
-        x_br /= 2;
-        y_br /= 2;
+    compute_laplacian(&images[j], &b->img_laplacians[j]);
+  }
+
+  b->img_laplacians[b->num_bands] = images[b->num_bands];
+  ImageS sampled;
+  ImageS mask_img_ = create_empty_image_s(mask_img->width, mask_img->height,
+                                          mask_img->channels);
+  convert_image_to_image_s(mask_img, &mask_img_);
+  for (int j = 0; j < b->num_bands; ++j) {
+    b->mask_gaussian[j] = mask_img_;
+    sampled = downsample_s(&mask_img_);
+    if (!sampled.data) {
+      return_val = 0;
+      goto clean;
     }
+    mask_img_ = sampled;
+  }
+
+  b->mask_gaussian[b->num_bands] = mask_img_;
+
+  int y_tl = tl_new.y - b->output_size.y;
+  int y_br = br_new.y - b->output_size.y;
+  int x_tl = tl_new.x - b->output_size.x;
+  int x_br = br_new.x - b->output_size.x;
+
+  for (int level = 0; level <= b->num_bands; ++level) {
+
+    int rows = (y_br - y_tl);
+    int cols = (x_br - x_tl);
+
+    FeedThreadData ftd;
+    ftd.rows = rows;
+    ftd.cols = cols;
+    ftd.x_tl = x_tl;
+    ftd.y_tl = y_tl;
+    ftd.out_level_width = b->out_width_levels[level];
+    ftd.out_level_height = b->out_height_levels[level];
+    ftd.level_width = b->img_laplacians[level].width;
+    ftd.level_height = b->img_laplacians[level].height;
+    ftd.level = level;
+    ftd.img_laplacians = b->img_laplacians;
+    ftd.mask_gaussian = b->mask_gaussian;
+    ftd.out = b->out;
+    ftd.out_mask = b->out_mask;
+
+    WorkerThreadArgs wtd;
+    wtd.ftd = &ftd;
+    ParallelOperatorArgs args = {rows, &wtd};
+
+    parallel_operator(FEED, &args);
+
+    x_tl /= 2;
+    y_tl /= 2;
+    x_br /= 2;
+    y_br /= 2;
+  }
 clean:
-    for (size_t i = 0; i <= b->num_bands; i++)
-    {
-        destroy_image_s(&images[i]);
-    }
+  for (size_t i = 0; i <= b->num_bands; i++) {
+    destroy_image_s(&images[i]);
+  }
 
-    return return_val;
+  return return_val;
 }
 
-void *blend_worker(void *args)
-{
-    ThreadArgs *arg = (ThreadArgs *)args;
-    int start_row = arg->start_index;
-    int end_row = arg->end_index;
-    BlendThreadData *b = (BlendThreadData *)arg->workerThreadArgs->btd;
-    for (int i = start_row; i < end_row; ++i)
-    {
-        if (i < b->out_size)
-        {
-            b->blended_image.data[i] = b->blended_image.data[i] + b->out_level.data[i];
-        }
+void *blend_worker(void *args) {
+  ThreadArgs *arg = (ThreadArgs *)args;
+  int start_row = arg->start_index;
+  int end_row = arg->end_index;
+  BlendThreadData *b = (BlendThreadData *)arg->workerThreadArgs->btd;
+  for (int i = start_row; i < end_row; ++i) {
+    if (i < b->out_size) {
+      b->blended_image.data[i] =
+          b->blended_image.data[i] + b->out_level.data[i];
     }
-    return NULL;
+  }
+  return NULL;
 }
 
-void *normalize_worker(void *args)
-{
-    ThreadArgs *arg = (ThreadArgs *)args;
-    int start_row = arg->start_index;
-    int end_row = arg->end_index;
-    NormalThreadData *n = (NormalThreadData *)arg->workerThreadArgs->ntd;
-
-    for (int y = start_row; y < end_row; ++y)
-    {
-        for (int x = 0; x < n->output_width; ++x)
-        {
-            int maskIndex = x + (y * n->output_width);
-            if (maskIndex < image_size_f(&n->out_mask[n->level]))
-            {
-                float w = n->out_mask[n->level].data[maskIndex];
-
-                for (char z = 0; z < CHANNELS; z++)
-                {
-                    int imgIndex = (x + (y * n->output_width)) * CHANNELS + z;
-                    if (imgIndex < image_size_s(&n->final_out[n->level]))
-                    {
-
-                        n->final_out[n->level].data[imgIndex] = (short)(n->out[n->level].data[imgIndex] / (w + WEIGHT_EPS));
-                    }
-                }
-            }
+void *normalize_worker(void *args) {
+  ThreadArgs *arg = (ThreadArgs *)args;
+  int start_row = arg->start_index;
+  int end_row = arg->end_index;
+  NormalThreadData *n = (NormalThreadData *)arg->workerThreadArgs->ntd;
+
+  for (int y = start_row; y < end_row; ++y) {
+    for (int x = 0; x < n->output_width; ++x) {
+      int maskIndex = x + (y * n->output_width);
+      if (maskIndex < image_size_f(&n->out_mask[n->level])) {
+        float w = n->out_mask[n->level].data[maskIndex];
+
+        for (char z = 0; z < CHANNELS; z++) {
+          int imgIndex = (x + (y * n->output_width)) * CHANNELS + z;
+          if (imgIndex < image_size_s(&n->final_out[n->level])) {
+
+            n->final_out[n->level].data[imgIndex] =
+                (short)(n->out[n->level].data[imgIndex] / (w + WEIGHT_EPS));
+          }
         }
+      }
     }
-    return NULL;
+  }
+  return NULL;
 }
 
-void blend(Blender *b)
-{
-    for (int level = 0; level <= b->num_bands; ++level)
-    {
-        b->final_out[level] = create_empty_image_s(b->out[level].width, b->out[level].height, b->out[level].channels);
-
-        NormalThreadData ntd = {b->out[level].width, level, b->out, b->out_mask, b->final_out};
-        WorkerThreadArgs wtd;
-        wtd.ntd = &ntd;
-        ParallelOperatorArgs args = {b->out[level].height, &wtd};
-
-        parallel_operator(NORMALIZE, &args);
-        destroy_image_f(&b->out[level]);
-        if (level > 0)
-        {
-            destroy_image_f(&b->out_mask[level]);
-        }
-    }
-
-    ImageS blended_image = b->final_out[b->num_bands];
+void blend(Blender *b) {
+  for (int level = 0; level <= b->num_bands; ++level) {
+    b->final_out[level] = create_empty_image_s(
+        b->out[level].width, b->out[level].height, b->out[level].channels);
 
-    for (int level = b->num_bands; level > 0; --level)
-    {
-        blended_image = upsample_image_s(&blended_image, 4.f);
-        int out_size = image_size_s(&b->final_out[level - 1]);
+    NormalThreadData ntd = {b->out[level].width, level, b->out, b->out_mask,
+                            b->final_out};
+    WorkerThreadArgs wtd;
+    wtd.ntd = &ntd;
+    ParallelOperatorArgs args = {b->out[level].height, &wtd};
 
-        BlendThreadData btd = {out_size, blended_image, b->final_out[level - 1]};
-        WorkerThreadArgs wtd;
-        wtd.btd = &btd;
-        ParallelOperatorArgs args = {out_size, &wtd};
-        parallel_operator(BLEND, &args);
+    parallel_operator(NORMALIZE, &args);
+    destroy_image_f(&b->out[level]);
+    if (level > 0) {
+      destroy_image_f(&b->out_mask[level]);
     }
+  }
 
-    b->result.data = (unsigned char *)malloc(b->output_size.width * b->output_size.height * CHANNELS * sizeof(unsigned char));
-    b->result.channels = blended_image.channels;
-    b->result.width = blended_image.width;
-    b->result.height = blended_image.height;
-
-    convert_images_to_image(&blended_image, &b->result);
-
-    for (size_t i = 0; i < b->result.height; i++)
-    {
-        for (size_t j = 0; j < b->result.width; j++)
-        {
-            int pos = j + (i * b->result.width);
-            float w = b->out_mask[0].data[pos];
-            if (w <= WEIGHT_EPS)
-            {
-                int imgPos = (j + (i * b->result.width)) * CHANNELS;
-                for (char c = 0; c < CHANNELS; c++)
-                {
-                    b->result.data[imgPos + c] = 0;
-                }
-            }
+  ImageS blended_image = b->final_out[b->num_bands];
+
+  for (int level = b->num_bands; level > 0; --level) {
+    blended_image = upsample_image_s(&blended_image, 4.f);
+    int out_size = image_size_s(&b->final_out[level - 1]);
+
+    BlendThreadData btd = {out_size, blended_image, b->final_out[level - 1]};
+    WorkerThreadArgs wtd;
+    wtd.btd = &btd;
+    ParallelOperatorArgs args = {out_size, &wtd};
+    parallel_operator(BLEND, &args);
+  }
+
+  b->result.data =
+      (unsigned char *)malloc(b->output_size.width * b->output_size.height *
+                              CHANNELS * sizeof(unsigned char));
+  b->result.channels = blended_image.channels;
+  b->result.width = blended_image.width;
+  b->result.height = blended_image.height;
+
+  convert_images_to_image(&blended_image, &b->result);
+
+  for (size_t i = 0; i < b->result.height; i++) {
+    for (size_t j = 0; j < b->result.width; j++) {
+      int pos = j + (i * b->result.width);
+      float w = b->out_mask[0].data[pos];
+      if (w <= WEIGHT_EPS) {
+        int imgPos = (j + (i * b->result.width)) * CHANNELS;
+        for (char c = 0; c < CHANNELS; c++) {
+          b->result.data[imgPos + c] = 0;
         }
+      }
     }
+  }
 
-    crop_image_buf(&b->result,
-                   0,
-                   max(0, b->result.height - b->real_out_size.height),
-                   0,
-                   max(0, b->result.width - b->real_out_size.width),
-                   CHANNELS);
-    free(blended_image.data);
+  crop_image_buf(&b->result, 0,
+                 max(0, b->result.height - b->real_out_size.height), 0,
+                 max(0, b->result.width - b->real_out_size.width), CHANNELS);
+  free(blended_image.data);
 }
 
-void parallel_operator(OperatorType operatorType, ParallelOperatorArgs *arg)
-{
-    int numThreads = get_cpus_count();
-    int rowsPerThread = arg->rows / numThreads;
-    int remainingRows = arg->rows % numThreads;
-
-    pthread_t threads[numThreads];
-    ThreadArgs thread_data[numThreads];
-
-    int startRow = 0;
-    for (unsigned int i = 0; i < numThreads; ++i)
-    {
-        int endRow = startRow + rowsPerThread + (remainingRows > 0 ? 1 : 0);
-        if (remainingRows > 0)
-        {
-            --remainingRows;
-        }
+void parallel_operator(OperatorType operatorType, ParallelOperatorArgs *arg) {
+  int numThreads = get_cpus_count();
+  int rowsPerThread = arg->rows / numThreads;
+  int remainingRows = arg->rows % numThreads;
 
-        switch (operatorType)
-        {
-        case UPSAMPLE:
-        case DOWNSAMPLE:
-            thread_data[i].end_index = endRow;
-            thread_data[i].start_index = startRow;
-            thread_data[i].workerThreadArgs = arg->workerThreadArgs;
-            if (operatorType == DOWNSAMPLE)
-            {
-                switch (arg->workerThreadArgs->std->image_type)
-                {
-                case IMAGE:
-                    pthread_create(&threads[i], NULL, down_sample_operation, &thread_data[i]);
-                    break;
-                case IMAGES:
-                    pthread_create(&threads[i], NULL, down_sample_operation_s, &thread_data[i]);
-                    break;
-                case IMAGEF:
-                    pthread_create(&threads[i], NULL, down_sample_operation_f, &thread_data[i]);
-                    break;
-                default:
-                    break;
-                }
-            }
-            else
-            {
-                switch (arg->workerThreadArgs->std->image_type)
-                {
-                case IMAGE:
-                    pthread_create(&threads[i], NULL, upsample_worker, &thread_data[i]);
-                    break;
-                case IMAGES:
-                    pthread_create(&threads[i], NULL, upsample_worker_s, &thread_data[i]);
-                    break;
-                case IMAGEF:
-                    pthread_create(&threads[i], NULL, upsample_worker_f, &thread_data[i]);
-                    break;
-                default:
-                    break;
-                }
-            }
+  pthread_t threads[numThreads];
+  ThreadArgs thread_data[numThreads];
 
-            break;
-        case FEED:
-            thread_data[i].end_index = endRow;
-            thread_data[i].start_index = startRow;
-            thread_data[i].workerThreadArgs = arg->workerThreadArgs;
-            pthread_create(&threads[i], NULL, feed_worker, &thread_data[i]);
-            break;
-        case LAPLACIAN:
-            thread_data[i].end_index = endRow;
-            thread_data[i].start_index = startRow;
-            thread_data[i].workerThreadArgs = arg->workerThreadArgs;
-            pthread_create(&threads[i], NULL, compute_laplacian_worker, &thread_data[i]);
-            break;
-        case BLEND:
-            thread_data[i].end_index = endRow;
-            thread_data[i].start_index = startRow;
-            thread_data[i].workerThreadArgs = arg->workerThreadArgs;
-            pthread_create(&threads[i], NULL, blend_worker, &thread_data[i]);
-            break;
-        case NORMALIZE:
-            thread_data[i].end_index = endRow;
-            thread_data[i].start_index = startRow;
-            thread_data[i].workerThreadArgs = arg->workerThreadArgs;
-            pthread_create(&threads[i], NULL, normalize_worker, &thread_data[i]);
-            break;
-        }
-
-        startRow = endRow;
+  int startRow = 0;
+  for (unsigned int i = 0; i < numThreads; ++i) {
+    int endRow = startRow + rowsPerThread + (remainingRows > 0 ? 1 : 0);
+    if (remainingRows > 0) {
+      --remainingRows;
     }
 
-    for (unsigned int i = 0; i < numThreads; ++i)
-    {
-        pthread_join(threads[i], NULL);
+    switch (operatorType) {
+    case UPSAMPLE:
+    case DOWNSAMPLE:
+      thread_data[i].end_index = endRow;
+      thread_data[i].start_index = startRow;
+      thread_data[i].workerThreadArgs = arg->workerThreadArgs;
+      if (operatorType == DOWNSAMPLE) {
+        switch (arg->workerThreadArgs->std->image_type) {
+        case IMAGE:
+          pthread_create(&threads[i], NULL, down_sample_operation,
+                         &thread_data[i]);
+          break;
+        case IMAGES:
+          pthread_create(&threads[i], NULL, down_sample_operation_s,
+                         &thread_data[i]);
+          break;
+        case IMAGEF:
+          pthread_create(&threads[i], NULL, down_sample_operation_f,
+                         &thread_data[i]);
+          break;
+        default:
+          break;
+        }
+      } else {
+        switch (arg->workerThreadArgs->std->image_type) {
+        case IMAGE:
+          pthread_create(&threads[i], NULL, upsample_worker, &thread_data[i]);
+          break;
+        case IMAGES:
+          pthread_create(&threads[i], NULL, upsample_worker_s, &thread_data[i]);
+          break;
+        case IMAGEF:
+          pthread_create(&threads[i], NULL, upsample_worker_f, &thread_data[i]);
+          break;
+        default:
+          break;
+        }
+      }
+
+      break;
+    case FEED:
+      thread_data[i].end_index = endRow;
+      thread_data[i].start_index = startRow;
+      thread_data[i].workerThreadArgs = arg->workerThreadArgs;
+      pthread_create(&threads[i], NULL, feed_worker, &thread_data[i]);
+      break;
+    case LAPLACIAN:
+      thread_data[i].end_index = endRow;
+      thread_data[i].start_index = startRow;
+      thread_data[i].workerThreadArgs = arg->workerThreadArgs;
+      pthread_create(&threads[i], NULL, compute_laplacian_worker,
+                     &thread_data[i]);
+      break;
+    case BLEND:
+      thread_data[i].end_index = endRow;
+      thread_data[i].start_index = startRow;
+      thread_data[i].workerThreadArgs = arg->workerThreadArgs;
+      pthread_create(&threads[i], NULL, blend_worker, &thread_data[i]);
+      break;
+    case NORMALIZE:
+      thread_data[i].end_index = endRow;
+      thread_data[i].start_index = startRow;
+      thread_data[i].workerThreadArgs = arg->workerThreadArgs;
+      pthread_create(&threads[i], NULL, normalize_worker, &thread_data[i]);
+      break;
     }
+
+    startRow = endRow;
+  }
+
+  for (unsigned int i = 0; i < numThreads; ++i) {
+    pthread_join(threads[i], NULL);
+  }
 }
diff --git a/laplace_blending.h b/laplace_blending.h
index cbb93c6..c272d94 100644
--- a/laplace_blending.h
+++ b/laplace_blending.h
@@ -1,32 +1,4 @@
-#include "jpeg.h"
-
-#define MAX_BANDS 7
-typedef enum
-{
-    DOWNSAMPLE,
-    UPSAMPLE,
-    LAPLACIAN,
-    FEED,
-    BLEND,
-    NORMALIZE
-} OperatorType;
-
-typedef struct
-{
-    int x;
-    int y;
-} Point;
-
-typedef struct
-{
-    int x;
-    int y;
-    int width;
-    int height;
-} Rect;
-
-Point br(Rect r);
-
+#include "image_operations.h"
 
 typedef struct
 {
@@ -43,93 +15,7 @@ typedef struct
     ImageS *mask_gaussian;
 } Blender;
 
-typedef struct
-{
-    float upsample_factor;
-    int new_width;
-    int new_height;
-    void *img;
-    void *sampled;
-    ImageType image_type;
-} SamplingThreadData;
-
-typedef struct
-{
-    ImageS *original;
-    ImageS *upsampled;
-    int total_size;
-} LaplacianThreadData;
-
-typedef struct
-{
-    int rows;
-    int cols;
-    int x_tl;
-    int y_tl;
-    int out_level_width;
-    int out_level_height;
-    int level_width;
-    int level_height;
-    int level;
-    ImageS *img_laplacians;
-    ImageS *mask_gaussian;
-    ImageF *out;
-    ImageF *out_mask;
-} FeedThreadData;
-
-typedef struct
-{
-    int output_width;
-    int level;
-    ImageF *out;
-    ImageF *out_mask;
-    ImageS *final_out;
-} NormalThreadData;
-
-typedef struct
-{
-    int out_size;
-    ImageS blended_image;
-    ImageS out_level;
-} BlendThreadData;
-
-typedef union
-{
-    SamplingThreadData *std;
-    LaplacianThreadData *ltd;
-    FeedThreadData *ftd;
-    BlendThreadData *btd;
-    NormalThreadData *ntd;
-} WorkerThreadArgs;
-
-typedef struct
-{
-    int rows;
-    WorkerThreadArgs *workerThreadArgs;
-} ParallelOperatorArgs;
-
-typedef struct
-{
-    int start_index;
-    int end_index;
-    WorkerThreadArgs *workerThreadArgs;
-} ThreadArgs;
-
 Blender *create_blender(Rect out_size, int nb);
 int feed(Blender *b, Image *img, Image *maskImg, Point tl);
 void blend(Blender *b);
 void destroy_blender(Blender *blender);
-Image create_image(const char *filename);
-Image create_empty_image(int width, int height, int channels);
-ImageS create_empty_image_s(int width, int height, int channels);
-ImageF create_empty_image_f(int width, int height, int channels);
-Image create_image_mask(int width, int height, float range, int left, int right);
-int save_image(const Image *img, char *out_filename);
-int image_size(Image *img);
-void destroy_image(Image *img);
-void destroy_image_s(ImageS *img);
-Image upsample( Image *img,float upsample_factor);
-Image downsample( Image *img);
-ImageS downsample_s( ImageS *img);
-void crop_image(Image *img, int cut_top, int cut_bottom, int cut_left, int cut_right);
-void parallel_operator(OperatorType operatorType, ParallelOperatorArgs *arg);
diff --git a/tests/test.c b/tests/test.c
index 94528c3..57978ea 100644
--- a/tests/test.c
+++ b/tests/test.c
@@ -3,7 +3,7 @@
 #include <time.h>
 #include <string.h>
 #include <stdlib.h>
-#include "laplace_blending.h"
+#include "image_operations.h"
 #include "utils.h"
 
 void test_sampling_operations()
@@ -83,4 +83,4 @@ int main()
     destroy_image_s(&ds);
 
     return 0;
-}
\ No newline at end of file
+}

From dffaa155ad114f7c5fabb9921068df0a4ec67c5e Mon Sep 17 00:00:00 2001
From: Akerele Abraham <abrahamakerele38@gmail.com>
Date: Mon, 26 May 2025 12:09:17 +0100
Subject: [PATCH 3/5] SIMD downsampling of uchar images

---
 .github/workflows/main.yml |   2 +
 .gitignore                 |   6 +-
 README.md                  |   2 +-
 image_operations.c         | 356 +++++++++++++++++++++++++++++++++++--
 jpeg.c                     |   5 +-
 jpeg.h                     |   5 +-
 laplace_blending.c         |  24 +--
 tests/test.c               | 164 ++++++++++-------
 utils.c                    |  21 +--
 9 files changed, 471 insertions(+), 114 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 232e480..f80d0c5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -14,6 +14,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
+        with:
+          submodules: true
 
       - name: Install dependencies (Linux)
         if: runner.os == 'Linux'
diff --git a/.gitignore b/.gitignore
index 10c42f0..6bbc2e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 !files/apple.jpeg
-!files/orange.jpeg 
+!files/orange.jpeg
 files/*
 *.jpeg
 *.jpg
@@ -15,4 +15,6 @@ installs/*
 tests/test
 *.dSYM*
 downsampled
-test
\ No newline at end of file
+test
+example.c
+example
diff --git a/README.md b/README.md
index 2da0c3b..8cae864 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ To verify the functionality of **NativeSticher**, follow the instructions below
 ### 1. Testing with libturbojpeg (Direct Compilation)
 If you have `libturbojpeg` installed, compile and run the test with the following command:
 ```bash
-gcc-14 -pthread -fsanitize=address -g -o stitch \
+gcc-14 -O3 -mavx2 -mfma -I simde/ -pthread -fsanitize=address -g -o stitch \
 -I../ -I/usr/local/include \
 -L/usr/local/lib -lturbojpeg \
 stitch.c ../laplace_blending.c ../jpeg.c ../image_operations.c ../utils.c && time ./stitch
diff --git a/image_operations.c b/image_operations.c
index 5ed16a3..2d6afb7 100644
--- a/image_operations.c
+++ b/image_operations.c
@@ -1,6 +1,8 @@
 
 #include "image_operations.h"
-#include "installs/native-stitcher/macos/x86_64/include/jpeg.h"
+#include "jpeg.h"
+#include "simde/simde/x86/avx2.h"
+#include <stdio.h>
 
 Image create_image(const char *filename) { return decompress_jpeg(filename); }
 
@@ -27,7 +29,7 @@ Image create_image_mask(int width, int height, float range, int left,
 }
 
 int save_image(const Image *img, char *out_filename) {
-  if (img->channels == CHANNELS) {
+  if (img->channels == RGB_CHANNELS) {
     return compress_jpeg(out_filename, img, 100);
   } else {
     return compress_grayscale_jpeg(out_filename, img, 100);
@@ -73,6 +75,307 @@ Point br(Rect r) {
   return result;
 }
 
+int convolve_1d_v_simd(int x, int width, int *row0, int *row1, int *row2,
+                       int *row3, int *row4, unsigned char *out_row) {
+
+  // v0 + v4 + 2v2 + 4(v1 + v3 + v2)
+  for (; x < width - 8; x += 8) {
+    simde__m256i r0 = simde_mm256_loadu_si256((const simde__m256i *)(row0 + x));
+    simde__m256i r1 = simde_mm256_loadu_si256((const simde__m256i *)(row1 + x));
+    simde__m256i r2 = simde_mm256_loadu_si256((const simde__m256i *)(row2 + x));
+    simde__m256i r3 = simde_mm256_loadu_si256((const simde__m256i *)(row3 + x));
+    simde__m256i r4 = simde_mm256_loadu_si256((const simde__m256i *)(row4 + x));
+
+    simde__m256i sum = simde_mm256_add_epi32(r0, r4);
+    sum = simde_mm256_add_epi32(sum, simde_mm256_slli_epi32(r2, 1));
+    simde__m256i t = simde_mm256_add_epi32(simde_mm256_add_epi32(r1, r3), r2);
+    sum = simde_mm256_add_epi32(sum, simde_mm256_slli_epi32(t, 2));
+
+    const simde__m256i bias = simde_mm256_set1_epi32(128);
+    sum = simde_mm256_add_epi32(sum, bias);
+    sum = simde_mm256_srli_epi32(sum, 8);
+
+    simde__m128i lo16 = simde_mm256_castsi256_si128(sum);
+    simde__m128i hi16 = simde_mm256_extracti128_si256(sum, 1);
+    simde__m128i packed = simde_mm_packs_epi32(lo16, hi16);
+    simde__m128i out8 = simde_mm_packus_epi16(packed, packed);
+    simde_mm_storeu_si128((simde__m128i *)out_row, out8);
+
+    out_row += 8;
+  }
+
+  return x;
+}
+
+int convolve_1d_3c(int x, int width, unsigned char *cur_src, int src_width,
+                   int *temp_out) {
+  for (; x < width; ++x) {
+    int xx = (x * 2);
+    for (int c = 0; c < RGB_CHANNELS; c++) {
+      int p0 = reflect_index(xx - 2, src_width),
+          p1 = reflect_index(xx - 1, src_width), p2 = xx,
+          p3 = reflect_index(xx + 1, src_width),
+          p4 = reflect_index(xx + 2, src_width);
+      int s0 = cur_src[p0 * RGB_CHANNELS + c];
+      int s1 = cur_src[p1 * RGB_CHANNELS + c];
+      int s2 = cur_src[p2 * RGB_CHANNELS + c];
+      int s3 = cur_src[p3 * RGB_CHANNELS + c];
+      int s4 = cur_src[p4 * RGB_CHANNELS + c];
+
+      temp_out[0] = s0 + s1 * 4 + s2 * 6 + s3 * 4 + s4;
+      ++temp_out;
+    }
+  }
+  return x;
+}
+
+void char_convolve_3(int range_start, int range_end, int src_width,
+                     int src_height, unsigned char *src, unsigned char *dst) {
+
+  int y = range_start;
+  int yy = y * 2;
+  int width = src_width / 2, height = src_height / 2;
+
+  unsigned char *rows[5] = {
+      src + (reflect_index(yy - 2, src_height)) * src_width * RGB_CHANNELS,
+      src + (reflect_index(yy - 1, src_height)) * src_width * RGB_CHANNELS,
+      src + yy * src_width * RGB_CHANNELS,
+      src + (reflect_index(yy + 1, src_height)) * src_width * RGB_CHANNELS,
+      src + (reflect_index(yy + 2, src_height)) * src_width * RGB_CHANNELS};
+
+  int *temp_dst_out = (int *)malloc(5 * width * RGB_CHANNELS * sizeof(int));
+  if (!temp_dst_out)
+    return;
+
+  int cache[16];
+
+  int *temp_dst_rows[5] = {temp_dst_out, temp_dst_out + (width * RGB_CHANNELS),
+                           temp_dst_out + (2 * width * RGB_CHANNELS),
+                           temp_dst_out + (3 * width * RGB_CHANNELS),
+                           temp_dst_out + (4 * width * RGB_CHANNELS)};
+  int s_y = -2;
+  int e_y = 3;
+
+  for (; y < range_end; y++) {
+
+    for (; s_y < e_y; s_y++) {
+      unsigned char *cur_src = rows[s_y + 2];
+      int *temp_out = temp_dst_rows[s_y + 2];
+      int x = 0;
+      const unsigned char *src0 = cur_src;
+      const unsigned char *src1 = cur_src + 3;
+      const unsigned char *src2 = cur_src + 6;
+      const unsigned char *src3 = cur_src + 9;
+      const unsigned char *src4 = cur_src + 12;
+
+      x = convolve_1d_3c(x, min(3, width), cur_src, src_width, temp_out);
+      temp_out = temp_out + (x * 3);
+
+      for (; x <= width - 3; x += 3) {
+        simde__m256i a_16 = simde_mm256_cvtepu8_epi16(
+            simde_mm_loadu_si128((const simde__m128i *)src0));
+
+        simde__m256i b_16 = simde_mm256_slli_epi16(
+            simde_mm256_cvtepu8_epi16(
+                simde_mm_loadu_si128((const simde__m128i *)src1)),
+            2);
+
+        simde__m256i c_16 = simde_mm256_cvtepu8_epi16(
+            simde_mm_loadu_si128((const simde__m128i *)src2));
+
+        c_16 = simde_mm256_add_epi16(
+            simde_mm256_add_epi16(simde_mm256_slli_epi16(c_16, 1), c_16),
+            simde_mm256_add_epi16(simde_mm256_slli_epi16(c_16, 1), c_16));
+
+        simde__m256i d_16 = simde_mm256_slli_epi16(
+            simde_mm256_cvtepu8_epi16(
+                simde_mm_loadu_si128((const simde__m128i *)src3)),
+            2);
+
+        simde__m256i e_16 = simde_mm256_cvtepu8_epi16(
+            simde_mm_loadu_si128((const simde__m128i *)src4));
+
+        simde__m256i sum = simde_mm256_add_epi16(
+            a_16, simde_mm256_add_epi16(
+                      b_16, simde_mm256_add_epi16(
+                                c_16, simde_mm256_add_epi16(d_16, e_16))));
+
+        simde__m256i lo_sum =
+            simde_mm256_cvtepi16_epi32(simde_mm256_castsi256_si128(sum));
+        simde__m256i hi_sum =
+            simde_mm256_cvtepi16_epi32(simde_mm256_extracti128_si256(sum, 1));
+
+        simde_mm256_storeu_si256((simde__m256i *)cache, lo_sum);
+        simde_mm256_storeu_si256((simde__m256i *)(cache + 8), hi_sum);
+
+        temp_out[0] = cache[0], temp_out[1] = cache[1], temp_out[2] = cache[2];
+        temp_out[3] = cache[6], temp_out[4] = cache[7], temp_out[5] = cache[8];
+        temp_out[6] = cache[12], temp_out[7] = cache[13],
+        temp_out[8] = cache[14];
+
+        temp_out += (3 * RGB_CHANNELS);
+        src0 += (6 * RGB_CHANNELS), src1 += (6 * RGB_CHANNELS);
+        src2 += (6 * RGB_CHANNELS);
+        src3 += (6 * RGB_CHANNELS), src4 += (6 * RGB_CHANNELS);
+      }
+
+      convolve_1d_3c(x, width, cur_src, src_width, temp_out);
+    }
+
+    yy = y * 2;
+    unsigned char *out_row = dst + (RGB_CHANNELS * width * y);
+
+    int *row0 = temp_dst_rows[0], *row1 = temp_dst_rows[1],
+        *row2 = temp_dst_rows[2], *row3 = temp_dst_rows[3],
+        *row4 = temp_dst_rows[4];
+
+    int xx =
+        convolve_1d_v_simd(0, width * 3, row0, row1, row2, row3, row4, out_row);
+    int x = xx / 3;
+    out_row = out_row + (x * RGB_CHANNELS);
+
+    for (; x < width; ++x) {
+      int xx = x * RGB_CHANNELS;
+      for (int c = 0; c < RGB_CHANNELS; c++) {
+        out_row[0] = clamp((row0[xx + c] + row1[xx + c] * 4 + row2[xx + c] * 6 +
+                            4 * row3[xx + c] + row4[xx + c]) >>
+                               8,
+                           0, 255);
+
+        ++out_row;
+      }
+    }
+
+    rows[0] = rows[2], rows[1] = rows[3], rows[2] = rows[4];
+    rows[3] = src + (reflect_index(((y + 1) * 2) + 1, src_height)) *
+                        (src_width * RGB_CHANNELS);
+    rows[4] = src + (reflect_index(((y + 1) * 2) + 2, src_height)) *
+                        (src_width * RGB_CHANNELS);
+
+    int *temp1 = temp_dst_rows[0], *temp2 = temp_dst_rows[1];
+    temp_dst_rows[0] = temp_dst_rows[2], temp_dst_rows[1] = temp_dst_rows[3],
+    temp_dst_rows[2] = temp_dst_rows[4], temp_dst_rows[3] = temp1,
+    temp_dst_rows[4] = temp2;
+
+    s_y = 1;
+  }
+
+  free(temp_dst_out);
+}
+
+int convolve_1d_1c(int x, int width, unsigned char *cur_src, int src_width,
+                   int *temp_out) {
+  for (; x < width; ++x) {
+    int xx = x * 2;
+    int s0 = cur_src[reflect_index(xx - 2, src_width)];
+    int s1 = cur_src[reflect_index(xx - 1, src_width)];
+    int s2 = cur_src[xx];
+    int s3 = cur_src[reflect_index(xx + 1, src_width)];
+    int s4 = cur_src[reflect_index(xx + 2, src_width)];
+
+    temp_out[0] = s0 + s1 * 4 + s2 * 6 + s3 * 4 + s4;
+    ++temp_out;
+  }
+  return x;
+}
+
+void char_convolve_1(int range_start, int range_end, int src_width,
+                     int src_height, unsigned char *src, unsigned char *dst) {
+
+  int y = range_start;
+  int yy = y * 2;
+  int width = src_width / 2, height = src_height / 2;
+
+  unsigned char *rows[5] = {
+      src + (reflect_index(yy - 2, src_height)) * src_width,
+      src + (reflect_index(yy - 1, src_height)) * src_width,
+      src + yy * src_width,
+      src + (reflect_index(yy + 1, src_height)) * src_width,
+      src + (reflect_index(yy + 2, src_height)) * src_width};
+
+  int *temp_dst_out = (int *)malloc(5 * width * sizeof(int));
+  if (!temp_dst_out)
+    return;
+
+  int *temp_dst_rows[5] = {
+      temp_dst_out, temp_dst_out + width, temp_dst_out + (2 * width),
+      temp_dst_out + (3 * width), temp_dst_out + (4 * width)};
+
+  int s_y = -2;
+  int e_y = 3;
+
+  for (; y < range_end; y++) {
+    for (; s_y < e_y; s_y++) {
+      unsigned char *cur_src = rows[s_y + 2];
+      int *temp_out = temp_dst_rows[s_y + 2];
+      int x = 0;
+      const unsigned char *src01 = cur_src;
+      const unsigned char *src23 = cur_src + 2;
+      const unsigned char *src4 = cur_src + 3;
+
+      const simde__m256i w1_4 = simde_mm256_setr_epi16(1, 4, 1, 4, 1, 4, 1, 4,
+                                                       1, 4, 1, 4, 1, 4, 1, 4);
+      const simde__m256i w6_4 = simde_mm256_setr_epi16(6, 4, 6, 4, 6, 4, 6, 4,
+                                                       6, 4, 6, 4, 6, 4, 6, 4);
+
+      x = convolve_1d_1c(x, 1, cur_src, src_width, temp_out);
+      temp_out = temp_out + x;
+      for (; x < width - 8;
+           x += 8, src01 += 16, src23 += 16, src4 += 16, temp_out += 8) {
+        simde__m256i a01_16 = simde_mm256_cvtepu8_epi16(
+            simde_mm_loadu_si128((const simde__m128i *)src01));
+        simde__m256i m1 = simde_mm256_madd_epi16(a01_16, w1_4);
+
+        simde__m256i a23_16 = simde_mm256_cvtepu8_epi16(
+            simde_mm_loadu_si128((const simde__m128i *)src23));
+        simde__m256i m2 = simde_mm256_madd_epi16(a23_16, w6_4);
+
+        simde__m256i a4_16 = simde_mm256_cvtepu8_epi16(
+            simde_mm_loadu_si128((const simde__m128i *)src4));
+        simde__m256i fifth = simde_mm256_srli_epi32(a4_16, 16);
+
+        simde__m256i sum =
+            simde_mm256_add_epi32(simde_mm256_add_epi32(m1, m2), fifth);
+
+        simde_mm256_storeu_si256((simde__m256i *)temp_out, sum);
+      }
+
+      convolve_1d_1c(x, width, cur_src, src_width, temp_out);
+    }
+
+    yy = y * 2;
+    unsigned char *out_row = dst + (width * y);
+
+    int *row0 = temp_dst_rows[0], *row1 = temp_dst_rows[1],
+        *row2 = temp_dst_rows[2], *row3 = temp_dst_rows[3],
+        *row4 = temp_dst_rows[4];
+
+    int x = convolve_1d_v_simd(0, width, row0, row1, row2, row3, row4, out_row);
+    out_row = out_row + x;
+
+    for (; x < width; ++x) {
+      out_row[0] = clamp(
+          (row0[x] + row1[x] * 4 + row2[x] * 6 + 4 * row3[x] + row4[x]) >> 8, 0,
+          255);
+
+      ++out_row;
+    }
+
+    rows[0] = rows[2], rows[1] = rows[3], rows[2] = rows[4];
+    rows[3] = src + (reflect_index(((y + 1) * 2) + 1, src_height)) * src_width;
+    rows[4] = src + (reflect_index(((y + 1) * 2) + 2, src_height)) * src_width;
+
+    int *temp1 = temp_dst_rows[0], *temp2 = temp_dst_rows[1];
+    temp_dst_rows[0] = temp_dst_rows[2], temp_dst_rows[1] = temp_dst_rows[3],
+    temp_dst_rows[2] = temp_dst_rows[4], temp_dst_rows[3] = temp1,
+    temp_dst_rows[4] = temp2;
+
+    s_y = 1;
+  }
+
+  free(temp_dst_out);
+}
+
 #define DEFINE_DOWNSAMPLE_WORKER_FUNC(NAME, IMAGE_T, PIXEL_T)                  \
   void *NAME(void *args) {                                                     \
     ThreadArgs *arg = (ThreadArgs *)args;                                      \
@@ -83,26 +386,41 @@ Point br(Rect r) {
     IMAGE_T *img = (IMAGE_T *)data->img;                                       \
     int imageSize = image_size(data->img);                                     \
     PIXEL_T *sampled = (PIXEL_T *)data->sampled;                               \
-    for (int y = start_row; y < end_row; ++y) {                                \
-      for (int x = 0; x < data->new_width; ++x) {                              \
-        for (char c = 0; c < img->channels; ++c) {                             \
-          float sum = 0.0;                                                     \
-          for (int i = -2; i < 3; i++) {                                       \
-            for (int j = -2; j < 3; j++) {                                     \
-              int src_row = 2 * y + i;                                         \
-              int src_col = 2 * x + j;                                         \
-              int rr = reflect_index(src_row, img->height);                    \
-              int cc = reflect_index(src_col, img->width);                     \
-              int pos = (cc + rr * img->width) * img->channels + c;            \
-              if (pos < imageSize) {                                           \
-                sum += GAUSSIAN_KERNEL[i + 2][j + 2] * img->data[pos];         \
+    if (data->image_type == IMAGE) {                                           \
+      switch (img->channels) {                                                 \
+      case GRAY_CHANNELS:                                                      \
+        char_convolve_1(start_row, end_row, img->width, img->height,            \
+                        (unsigned char *)img->data, (unsigned char *)sampled); \
+        break;                                                                 \
+      case RGB_CHANNELS:                                                       \
+        char_convolve_3(start_row, end_row, img->width, img->height,            \
+                        (unsigned char *)img->data, (unsigned char *)sampled); \
+        break;                                                                 \
+      default:                                                                 \
+        break;                                                                 \
+      }                                                                        \
+    } else {                                                                   \
+      for (int y = start_row; y < end_row; ++y) {                              \
+        for (int x = 0; x < data->new_width; ++x) {                            \
+          for (char c = 0; c < img->channels; ++c) {                           \
+            float sum = 0.0;                                                   \
+            for (int i = -2; i < 3; i++) {                                     \
+              for (int j = -2; j < 3; j++) {                                   \
+                int src_row = 2 * y + i;                                       \
+                int src_col = 2 * x + j;                                       \
+                int rr = reflect_index(src_row, img->height);                  \
+                int cc = reflect_index(src_col, img->width);                   \
+                int pos = (cc + rr * img->width) * img->channels + c;          \
+                if (pos < imageSize) {                                         \
+                  sum += GAUSSIAN_KERNEL[i + 2][j + 2] * img->data[pos];       \
+                }                                                              \
               }                                                                \
             }                                                                  \
+            if (data->image_type == IMAGE) {                                   \
+              sum = (PIXEL_T)clamp(ceil(sum), 0, 255);                         \
+            }                                                                  \
+            sampled[(y * data->new_width + x) * img->channels + c] = sum;      \
           }                                                                    \
-          if (data->image_type == IMAGE) {                                     \
-            sum = (PIXEL_T)clamp(ceil(sum), 0, 255);                           \
-          }                                                                    \
-          sampled[(y * data->new_width + x) * img->channels + c] = sum;        \
         }                                                                      \
       }                                                                        \
     }                                                                          \
diff --git a/jpeg.c b/jpeg.c
index a1bb256..2d14a3c 100644
--- a/jpeg.c
+++ b/jpeg.c
@@ -9,7 +9,7 @@
 Image decompress_jpeg(const char *filename)
 {
     Image result;
-    result.channels = CHANNELS;
+    result.channels = RGB_CHANNELS;
     tjhandle handle = tjInitDecompress();
     if (!handle)
     {
@@ -346,7 +346,7 @@ void crop_image_buf(Image *img, int cut_top, int cut_bottom, int cut_left, int c
     {
         return;
     }
-    
+
 
     unsigned char *cropped = (unsigned char *)malloc(new_width * new_height * channels);
 
@@ -382,4 +382,3 @@ IMAGE_CONVERT_FUNC(convert_image_to_image_f, Image, ImageF, float)
 IMAGE_CONVERT_FUNC(convert_image_to_image_s, Image, ImageS, short)
 IMAGE_CONVERT_FUNC(convert_imagef_to_image, ImageF, Image, unsigned char)
 IMAGE_CONVERT_FUNC(convert_images_to_image, ImageS, Image, unsigned char)
-
diff --git a/jpeg.h b/jpeg.h
index 6a476fd..44de358 100644
--- a/jpeg.h
+++ b/jpeg.h
@@ -1,6 +1,7 @@
 #ifndef IMAGE_HEADERS
 #define IMAGE_HEADERS
-#define CHANNELS 3
+#define RGB_CHANNELS 3
+#define GRAY_CHANNELS 1
 
 typedef enum {
     BORDER_CONSTANT,
@@ -54,4 +55,4 @@ void convert_image_to_image_f(Image* in , ImageF *out);
 void convert_image_to_image_s(Image* in , ImageS *out);
 void convert_imagef_to_image(ImageF* in , Image *out);
 void convert_images_to_image(ImageS* in , Image *out);
-#endif
\ No newline at end of file
+#endif
diff --git a/laplace_blending.c b/laplace_blending.c
index da4bf1b..2a28ee2 100644
--- a/laplace_blending.c
+++ b/laplace_blending.c
@@ -139,16 +139,16 @@ void *feed_worker(void *args) {
       int outMaskLevelIndex =
           ((i + f->x_tl) + ((k + f->y_tl) * f->out_level_width));
 
-      for (char z = 0; z < CHANNELS; ++z) {
-        int imgIndex = ((i + (k * f->level_width)) * CHANNELS) + z;
+      for (char z = 0; z < RGB_CHANNELS; ++z) {
+        int imgIndex = ((i + (k * f->level_width)) * RGB_CHANNELS) + z;
 
         if (imgIndex < f->img_laplacians[f->level].width *
-                           f->img_laplacians[f->level].height * CHANNELS &&
+                           f->img_laplacians[f->level].height * RGB_CHANNELS &&
             maskIndex < f->mask_gaussian[f->level].width *
                             f->mask_gaussian[f->level].height) {
 
           int outLevelIndex =
-              ((i + f->x_tl) + (k + f->y_tl) * f->out_level_width) * CHANNELS +
+              ((i + f->x_tl) + (k + f->y_tl) * f->out_level_width) * RGB_CHANNELS +
               z;
 
           float maskVal = f->mask_gaussian[f->level].data[maskIndex];
@@ -159,7 +159,7 @@ void *feed_worker(void *args) {
           imgVal = imgVal * maskVal;
 
           if (outLevelIndex <
-                  f->out_level_height * f->out_level_width * CHANNELS &&
+                  f->out_level_height * f->out_level_width * RGB_CHANNELS &&
               outMaskLevelIndex < f->out_level_height * f->out_level_width) {
             f->out[f->level].data[outLevelIndex] += imgVal;
 
@@ -218,7 +218,7 @@ int feed(Blender *b, Image *img, Image *mask_img, Point tl) {
   int bottom = br_new.y - tl.y - img->height;
   int right = br_new.x - tl.x - img->width;
 
-  add_border_to_image(img, top, bottom, left, right, CHANNELS, BORDER_REFLECT);
+  add_border_to_image(img, top, bottom, left, right, RGB_CHANNELS, BORDER_REFLECT);
   add_border_to_image(mask_img, top, bottom, left, right, 1, BORDER_CONSTANT);
 
   images[0] = create_empty_image_s(img->width, img->height, img->channels);
@@ -326,8 +326,8 @@ void *normalize_worker(void *args) {
       if (maskIndex < image_size_f(&n->out_mask[n->level])) {
         float w = n->out_mask[n->level].data[maskIndex];
 
-        for (char z = 0; z < CHANNELS; z++) {
-          int imgIndex = (x + (y * n->output_width)) * CHANNELS + z;
+        for (char z = 0; z < RGB_CHANNELS; z++) {
+          int imgIndex = (x + (y * n->output_width)) * RGB_CHANNELS + z;
           if (imgIndex < image_size_s(&n->final_out[n->level])) {
 
             n->final_out[n->level].data[imgIndex] =
@@ -373,7 +373,7 @@ void blend(Blender *b) {
 
   b->result.data =
       (unsigned char *)malloc(b->output_size.width * b->output_size.height *
-                              CHANNELS * sizeof(unsigned char));
+                              RGB_CHANNELS * sizeof(unsigned char));
   b->result.channels = blended_image.channels;
   b->result.width = blended_image.width;
   b->result.height = blended_image.height;
@@ -385,8 +385,8 @@ void blend(Blender *b) {
       int pos = j + (i * b->result.width);
       float w = b->out_mask[0].data[pos];
       if (w <= WEIGHT_EPS) {
-        int imgPos = (j + (i * b->result.width)) * CHANNELS;
-        for (char c = 0; c < CHANNELS; c++) {
+        int imgPos = (j + (i * b->result.width)) * RGB_CHANNELS;
+        for (char c = 0; c < RGB_CHANNELS; c++) {
           b->result.data[imgPos + c] = 0;
         }
       }
@@ -395,7 +395,7 @@ void blend(Blender *b) {
 
   crop_image_buf(&b->result, 0,
                  max(0, b->result.height - b->real_out_size.height), 0,
-                 max(0, b->result.width - b->real_out_size.width), CHANNELS);
+                 max(0, b->result.width - b->real_out_size.width), RGB_CHANNELS);
   free(blended_image.data);
 }
 
diff --git a/tests/test.c b/tests/test.c
index 57978ea..e5fb1ec 100644
--- a/tests/test.c
+++ b/tests/test.c
@@ -1,86 +1,120 @@
 
-#include <stdio.h>
-#include <time.h>
-#include <string.h>
-#include <stdlib.h>
+
 #include "image_operations.h"
 #include "utils.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
 
-void test_sampling_operations()
-{
-    int data[16] = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160};
-    int expected[4] = {48, 59, 93, 104};
-    int expected_up[16] = {62, 65, 69, 70, 73, 76, 80, 82, 90, 93, 97, 98, 96, 99, 103, 104};
-    Image mask1 = create_empty_image(4, 4, 1);
-    for (int i = 0; i < 16; i++)
-    {
-        mask1.data[i] = data[i];
-    }
+void test_sampling_operations() {
+  int data[16] = {10, 20,  30,  40,
+                  50,  60,  70,  80,
+                  90, 100, 110, 120,
+                  130, 140, 150, 160};
+  int expected[4] = {48, 59, 93, 104};
+  int expected_up[16] = {62, 65, 69, 70, 73, 76, 80,  82,
+                         90, 93, 97, 98, 96, 99, 103, 104};
+  Image mask1 = create_empty_image(4, 4, 1);
+  for (int i = 0; i < 16; i++) {
+    mask1.data[i] = data[i];
+  }
 
-    Image down = downsample(&mask1);
-    if (image_size(&down) != 4)
-    {
-        printf("FATAL Image  size doesn't match expected expected 16 , got (%d)\n", image_size(&down));
-        exit(1);
-    }
-    for (int i = 0; i < 4; i++)
-    {
-        if (down.data[i] != expected[i])
+  Image down = downsample(&mask1);
+  if (image_size(&down) != 4) {
+    printf("FATAL Image  size doesn't match expected expected 16 , got (%d)\n",
+           image_size(&down));
+    exit(1);
+  }
+  for (int i = 0; i < 4; i++) {
+        if (abs(down.data[i] - expected[i]) > 2)
         {
-            printf("FATAL Image  pixel  doesn't match expected (%d) got (%d)\n", expected[i], down.data[i]);
+          printf("FATAL Image  pixel  doesn't match expected (%d) got (%d)\n",
+                 expected[i], down.data[i]);
         }
-    }
+  }
 
-    Image up = upsample(&down, 4.f);
-    if (image_size(&up) != 16)
-    {
-        printf("FATAL Image  size doesn't match expected expected 16 , got (%d)\n", image_size(&down));
-        exit(1);
-    }
-    for (int i = 0; i < 16; i++)
-    {
-        if (abs(up.data[i] - expected_up[i]) > 0)
-        {
-            printf("FATAL Image  pixel  doesn't match expected (%d) got (%d)\n", expected_up[i], up.data[i]);
-            exit(1);
-        }
+  Image up = upsample(&down, 4.f);
+  if (image_size(&up) != 16) {
+    printf("FATAL Image  size doesn't match expected expected 16 , got (%d)\n",
+           image_size(&down));
+    exit(1);
+  }
+  for (int i = 0; i < 16; i++) {
+    if (abs(up.data[i] - expected_up[i]) > 2) {
+      printf("FATAL Image  pixel  doesn't match expected (%d) got (%d)\n",
+             expected_up[i], up.data[i]);
+      exit(1);
     }
+  }
 
-    int data2[48] = {249, 144, 1, 251, 146, 3, 254, 149, 6, 255, 152, 9, 250, 145, 2, 252, 147, 4, 254, 149,
-                     6, 255, 151, 8, 252, 147, 4, 252, 147, 4, 253, 148, 5, 254, 149, 6, 253, 148, 5, 253, 148, 5, 253, 148, 5, 253, 148, 5};
-    Image rgb_image = create_empty_image(4, 4, CHANNELS);
-    for (int i = 0; i < image_size(&rgb_image); i++)
-    {
-        rgb_image.data[i] = data2[i];
-    }
+  int data2[48] = {249, 144, 1, 251, 146, 3, 254, 149, 6, 255, 152, 9,
+                   250, 145, 2, 252, 147, 4, 254, 149, 6, 255, 151, 8,
+                   252, 147, 4, 252, 147, 4, 253, 148, 5, 254, 149, 6,
+                   253, 148, 5, 253, 148, 5, 253, 148, 5, 253, 148, 5};
+  Image rgb_image = create_empty_image(4, 4, RGB_CHANNELS);
+  for (int i = 0; i < image_size(&rgb_image); i++) {
+    rgb_image.data[i] = data2[i];
+  }
+
+  unsigned char input_uc[4 * 4 * 3] = {
+      10,  10,  10,  20,  20,  20,  30,  30,  30,  40,  40,  40,
+      50,  50,  50,  60,  60,  60,  70,  70,  70,  80,  80,  80,
+      90,  90,  90,  100, 100, 100, 110, 110, 110, 120, 120, 120,
+      130, 130, 130, 140, 140, 140, 150, 150, 150, 160, 160, 160};
+
+  for (int i = 0; i < image_size(&rgb_image); i++) {
+    rgb_image.data[i] = input_uc[i];
+  }
 
-    destroy_image(&mask1);
-    destroy_image(&down);
-    destroy_image(&rgb_image);
+  Image out = downsample(&rgb_image);
+
+  for (int i = 0; i < 2 * 2 * 3; i++) {
+    printf("%d ", out.data[i]);
+  }
+  printf("\n");
+
+
+  destroy_image(&out);
+  destroy_image(&mask1);
+  destroy_image(&down);
+  destroy_image(&rgb_image);
 }
 
-int main()
-{
+int main() {
+
+  Image img_buf1 = create_image("../files/apple.jpeg");
+  Image mask = convert_RGB_to_gray(&img_buf1);
+
+  Image dm = downsample(&mask);
+  save_image(&dm, "testm1.jpg");
+
+  ImageS imgs =
+      create_empty_image_s(img_buf1.width, img_buf1.height, img_buf1.channels);
 
-    Image img_buf1 = create_image("../files/apple.jpeg");
-    Image d = downsample(&img_buf1);
-    save_image(&d, "test1.jpg");
+  convert_image_to_image_s(&img_buf1, &imgs);
+  ImageS ds = downsample_s(&imgs);
+  Image ds1 = create_empty_image(ds.width, ds.height, ds.channels);
+  convert_images_to_image(&imgs, &img_buf1);
+  convert_images_to_image(&ds, &ds1);
 
-    ImageS imgs = create_empty_image_s(img_buf1.width, img_buf1.height, img_buf1.channels);
+  save_image(&ds1, "test2.jpg");
 
-    convert_image_to_image_s(&img_buf1, &imgs);
-    ImageS ds = downsample_s(&imgs);
-    Image ds1 = create_empty_image(ds.width, ds.height, ds.channels);
-    convert_images_to_image(&imgs, &img_buf1);
-    convert_images_to_image(&ds, &ds1);
+  test_sampling_operations();
 
-    save_image(&ds1, "test2.jpg");
+  destroy_image(&img_buf1);
+  destroy_image_s(&imgs);
+  destroy_image_s(&ds);
+  destroy_image(&dm);
 
-    test_sampling_operations();
+  Image bbbbb = create_image("/Users/abrahamakerele/p-h/lendostuff/lendo-image-processor/files/1.jpg");
+  Image mask_bbbbb = convert_RGB_to_gray(&bbbbb);
+  Image d = downsample(&bbbbb);
+  printf("%d %d %d\n",d.width,d.height,d.channels);
+  save_image(&d, "test1.jpg");
 
-    destroy_image(&img_buf1);
-    destroy_image_s(&imgs);
-    destroy_image_s(&ds);
+  destroy_image(&bbbbb);
+  destroy_image(&mask_bbbbb);
 
-    return 0;
+  return 0;
 }
diff --git a/utils.c b/utils.c
index c6c9db7..928d2ed 100644
--- a/utils.c
+++ b/utils.c
@@ -7,15 +7,16 @@
 
 int get_cpus_count()
 {
-#if defined(_WIN32) || defined(_WIN64)
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo(&sysinfo);
-    return sysinfo.dwNumberOfProcessors;
-#elif defined(_SC_NPROCESSORS_ONLN)
-    return (int)sysconf(_SC_NPROCESSORS_ONLN);
-#else
-    return 1;
-#endif
+// #if defined(_WIN32) || defined(_WIN64)
+//     SYSTEM_INFO sysinfo;
+//     GetSystemInfo(&sysinfo);
+//     return sysinfo.dwNumberOfProcessors;
+// #elif defined(_SC_NPROCESSORS_ONLN)
+//     return (int)sysconf(_SC_NPROCESSORS_ONLN);
+// #else
+//     return 1;
+// #endif
+return 1;
 }
 
 int clamp(int value, int min, int max)
@@ -48,7 +49,7 @@ int max(int a, int b)
 
 int reflect_index(int i,int n){
     if (i < 0)
-        return  -i;
+        return  -i % n;
     else if (i >= n)
         return 2 * n - i - 2;
     else

From dfb7cf675687177e8983fe293c94b98010181995 Mon Sep 17 00:00:00 2001
From: Akerele Abraham <abrahamakerele38@gmail.com>
Date: Mon, 26 May 2025 12:15:27 +0100
Subject: [PATCH 4/5] remove recursive  git modules

---
 .github/workflows/main.yml |  2 --
 utils.c                    | 19 +++++++++----------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f80d0c5..232e480 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -14,8 +14,6 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
-        with:
-          submodules: true
 
       - name: Install dependencies (Linux)
         if: runner.os == 'Linux'
diff --git a/utils.c b/utils.c
index 928d2ed..51d5b36 100644
--- a/utils.c
+++ b/utils.c
@@ -7,16 +7,15 @@
 
 int get_cpus_count()
 {
-// #if defined(_WIN32) || defined(_WIN64)
-//     SYSTEM_INFO sysinfo;
-//     GetSystemInfo(&sysinfo);
-//     return sysinfo.dwNumberOfProcessors;
-// #elif defined(_SC_NPROCESSORS_ONLN)
-//     return (int)sysconf(_SC_NPROCESSORS_ONLN);
-// #else
-//     return 1;
-// #endif
-return 1;
+#if defined(_WIN32) || defined(_WIN64)
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    return sysinfo.dwNumberOfProcessors;
+#elif defined(_SC_NPROCESSORS_ONLN)
+    return (int)sysconf(_SC_NPROCESSORS_ONLN);
+#else
+    return 1;
+#endif
 }
 
 int clamp(int value, int min, int max)

From 41d0017753186e93e6ac161cb6f79f9d572202dc Mon Sep 17 00:00:00 2001
From: Akerele Abraham <abrahamakerele38@gmail.com>
Date: Mon, 26 May 2025 13:03:27 +0100
Subject: [PATCH 5/5] Debugging github CI

---
 .github/workflows/main.yml |  4 +++-
 build.sh                   | 23 +++++++++++++----------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 232e480..82e5f43 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -14,6 +14,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
+        with:
+          submodules: true
 
       - name: Install dependencies (Linux)
         if: runner.os == 'Linux'
@@ -31,4 +33,4 @@ jobs:
         run: brew install gcc make
 
       - name: Build
-        run: git clone https://github.com/libjpeg-turbo/libjpeg-turbo.git && ./build.sh  build macos
+        run: ./build.sh  build macos
diff --git a/build.sh b/build.sh
index 8935673..e88bc0a 100755
--- a/build.sh
+++ b/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-set -e 
+set -e
 BUILD_DIR_LIB_TURBOJPEG=$PWD/builds/libturbojpeg
 INSTALL_DIR_LIB_TURBOJPEG=$PWD/installs/libturbojpeg
 BUILD_DIR_LIB_NATIVE_STITCHER=$PWD/builds/native-stitcher
@@ -34,6 +34,7 @@ build_macos() {
       -DCMAKE_SYSTEM_PROCESSOR=$ARCH \
       -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
       -DCMAKE_C_COMPILER=$(xcrun --sdk macosx --find clang) \
+      -DCMAKE_OSX_SYSROOT=$(xcrun --sdk macosx --show-sdk-path) \
       -DCMAKE_C_FLAGS="$TARGET_FLAG"
 
     make -j$(sysctl -n hw.logicalcpu)
@@ -52,10 +53,12 @@ build_macos() {
         -DCMAKE_SYSTEM_PROCESSOR=$ARCH \
         -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
         -DCMAKE_C_COMPILER=$(xcrun --sdk macosx --find clang) \
+        -DCMAKE_OSX_ARCHITECTURES=$ARCH \
+        -DCMAKE_OSX_SYSROOT=$(xcrun --sdk macosx --show-sdk-path) \
         -DCMAKE_C_FLAGS="$TARGET_FLAG"
       make -j$(sysctl -n hw.logicalcpu)
       make install
-    popd    
+    popd
   done
 }
 
@@ -65,7 +68,7 @@ build_android() {
     BUILD_DIR="$BUILD_DIR_LIB_TURBOJPEG/andriod/$ARCH"
     INSTALL_DIR="$INSTALL_DIR_LIB_TURBOJPEG/andriod/$ARCH"
     mkdir -p "$BUILD_DIR"
-    
+
 
     if [ "$ARCH" == "arm64-v8a" ]; then
       TARGET_FLAG="-target aarch64-linux-android21"
@@ -120,7 +123,7 @@ build_android() {
         -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
         -DCMAKE_C_COMPILER=$ANDROID_NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/clang \
         -DCMAKE_C_FLAGS="$TARGET_FLAG" \
-        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake 
+        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake
       make -j$(sysctl -n hw.logicalcpu)
       make install
     popd
@@ -148,7 +151,7 @@ build_ios() {
         -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
         -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
         -DCMAKE_C_COMPILER=$(xcrun --sdk iphoneos --find clang) \
-        -DCMAKE_C_FLAGS="$TARGET_FLAG" 
+        -DCMAKE_C_FLAGS="$TARGET_FLAG"
 
       make -j$(sysctl -n hw.logicalcpu)
       make install
@@ -167,7 +170,7 @@ build_ios() {
         -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
         -DCMAKE_C_FLAGS="-target arm64-apple-ios12.0" \
         -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
-        -DCMAKE_C_COMPILER=$(xcrun --sdk iphoneos --find clang) 
+        -DCMAKE_C_COMPILER=$(xcrun --sdk iphoneos --find clang)
       make -j$(sysctl -n hw.logicalcpu)
       make install
     popd
@@ -193,7 +196,7 @@ build_ios_sim() {
           -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
           -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
           -DCMAKE_C_COMPILER=$(xcrun --sdk iphoneos --find clang) \
-          -DCMAKE_C_FLAGS="$TARGET_FLAG" 
+          -DCMAKE_C_FLAGS="$TARGET_FLAG"
 
         make -j$(sysctl -n hw.logicalcpu)
         make install
@@ -212,7 +215,7 @@ build_ios_sim() {
           -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
           -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
           -DCMAKE_C_FLAGS="-target $ARCH-apple-ios14.0-simulator" \
-          -DCMAKE_C_COMPILER=$(xcrun --sdk iphonesimulator --find clang) 
+          -DCMAKE_C_COMPILER=$(xcrun --sdk iphonesimulator --find clang)
         make -j$(sysctl -n hw.logicalcpu)
         make install
       popd
@@ -269,7 +272,7 @@ case "$1" in
             ;;
         ios-sim)
             build_ios_sim
-            ;;              
+            ;;
         *)
         help
         ;;
@@ -285,4 +288,4 @@ esac
 
 
 # gcc-14 -I../ -I/usr/local/include -L/usr/local/lib -lturbojpeg -pthread -fsanitize=address -g -o downsampled ../laplace_blending.c ../jpeg.c ../utils.c downsampling.c && ./downsampled
-# gcc-14 -I../ -I/usr/local/include -L/usr/local/lib -lturbojpeg -pthread -fsanitize=address -g -o stitch  ../laplace_blending.c ../jpeg.c ../utils.c stitch.c && ./stitch
\ No newline at end of file
+# gcc-14 -I../ -I/usr/local/include -L/usr/local/lib -lturbojpeg -pthread -fsanitize=address -g -o stitch  ../laplace_blending.c ../jpeg.c ../utils.c stitch.c && ./stitch