diff --git a/Project.toml b/Project.toml
index 9bbdb12..133fb48 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "JetPackTransforms"
 uuid = "e0630bc5-6e1f-509c-a3e4-0aea24bd2b1c"
-version = "0.4.0"
+version = "0.4.1"
 
 [deps]
 DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"
diff --git a/docs/src/slantstack.md b/docs/src/slantstack.md
index 3368a87..48b95ef 100644
--- a/docs/src/slantstack.md
+++ b/docs/src/slantstack.md
@@ -44,3 +44,14 @@ Thus, the mapping between ``k_h`` and ``\theta`` is given by
 ```math
 d(k_z,\theta) = \delta(k_h + k_z \tan(\theta))d(k_z,k_h).
 ```
+
+In 3D, the mapping from ``k_{hx},k_{hy}`` to reflection angle ``\theta`` at a given azimuth ``\phi`` depends on the geologic dip defined by the dip ``\alpha`` and azimuth ``\beta``. Without further derivations, it can be shown that this mapping is given by
+
+```math
+d(k_z,\theta,\phi) = \delta(k_{hx} + \xi\, k_z \tan(\theta)\cos(\phi))\delta(k_{hy} + \xi\,k_z \tan(\theta)\sin(\phi))d(k_z,k_{hx},k_{hy}),
+```
+where ``\xi`` is a correction factor given by
+
+```math
+\xi = \sqrt(\frac{1 + \tan^2(\alpha)}{1 + \tan^2(\alpha)\cos^2(\beta - \phi)}).
+```
\ No newline at end of file
diff --git a/src/jop_slantstack.jl b/src/jop_slantstack.jl
index 222d9bf..a2790d2 100644
--- a/src/jop_slantstack.jl
+++ b/src/jop_slantstack.jl
@@ -3,7 +3,7 @@
 
 where `A` is the 2D slant-stack operator mapping for `z-h` to `z-θ` (depth mode) or `t-h` to `tau-p` (time mode).
 The domain of the operator is `nz` x `nh` with precision T, `dz` is the depth spacing (or time interval),
-`dh` is the offset spacing, and `h0` is the origin of the offset axis.  The additional named optional arguments
+`dh` is the (half) offset spacing, and `h0` is the origin of the (half) offset axis.  The additional named optional arguments
 along with their default values are,
 
 * `mode="depth` - choose between "depth" and "time" to specify if the input domain is `z-h` or `t-h`
@@ -146,6 +146,9 @@ end
 @inline function slantstack_compute_kh_from_kz(ikz::Int64, ip::Int64, tant, p, kz, kh, nhfft)
     _kh = -kz[ikz]*tant[ip]
 
+    # check if wavenumber is outside Nyquist
+    ((_kh > kh[div(nhfft+1,2)]) || (_kh < kh[div(nhfft+1,2)+1])) && return -1, -1, _kh
+
     ikh_m1 = floor(Int64, _kh/kh[2]) + 1
     ikh_p1 = ceil(Int64, _kh/kh[2]) + 1
 
@@ -158,6 +161,9 @@ end
 @inline function slantstack_compute_kh_from_frequency(iω::Int64, ip::Int64, tant, p, ω, kh, nhfft)
     _kh = p[ip]*ω[iω]
 
+    # check if wavenumber is outside Nyquist
+    ((_kh > kh[div(nhfft+1,2)]) || (_kh < kh[div(nhfft+1,2)+1])) && return -1, -1, _kh
+
     ikh_m1 = floor(Int64, _kh/kh[2]) + 1
     ikh_p1 = ceil(Int64, _kh/kh[2]) + 1
 
@@ -172,7 +178,7 @@ end
 
 where `A` is the 3D slant-stack operator mapping for `z-hx-hy` to `z-θ-ϕ` (depth mode) or `t-hx-hy` to `tau-px-py` (time mode).
 The domain of the operator is typically `nz` x `nhx` x `nhy` with precision T, `dz` is the depth spacing (or time interval),
-`dhx` and `dhy` are the offset spacings, and `hx0` and `hy0` are the origins of the offset axes.
+`dhx` and `dhy` are the (half) offset spacings, and `hx0` and `hy0` are the origins of the (half) offset axes.
 The domain can also be `nz` x `ny` x `nx` x `nhx` x `nhy` where the `y` and `x` dimensions are passive.
 The additional named optional arguments along with their default values are,
 
@@ -301,15 +307,14 @@ function JopSlantStack3D_df!(d::AbstractArray{T,3}, m::AbstractArray{T,3}; mode,
     end
 
     is_out_of_bounds = (ikhx_m1, ikhx_p1, ikhy_m1, ikhy_p1)->(ikhx_m1 < 1 || ikhx_p1 > nhxfft || ikhy_m1 < 1 || ikhy_p1 > nhyfft)
-
+    
     D = zeros(eltype(M), size(M,1), npx, npy)
     @threads for ip = 1:npx*npy
-        ipx = div(ip-1, npy) + 1
-        ipy = mod(ip-1, npy) + 1
-        num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-        denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
         for ikz = 1:div(nzfft,2)+1
-            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, num, denom)
+            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, factor)
 
             is_out_of_bounds(ikhx_m1, ikhx_p1, ikhy_m1, ikhy_p1) && continue
 
@@ -356,12 +361,11 @@ function JopSlantStack3D_df!(d::AbstractArray{T,5}, m::AbstractArray{T,5}; mode,
 
     D = zeros(eltype(M), size(M,1), ny, nx, npx, npy)
     @threads for ip = 1:npx*npy
-        ipx = div(ip-1, npy) + 1
-        ipy = mod(ip-1, npy) + 1
-        num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-        denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
         for ikz = 1:div(nzfft,2)+1
-            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, num, denom)
+            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, factor)
 
             is_out_of_bounds(ikhx_m1, ikhx_p1, ikhy_m1, ikhy_p1) && continue
 
@@ -407,12 +411,11 @@ function JopSlantStack3D_df′!(m::AbstractArray{T,3}, d::AbstractArray{T,3}; mo
 
     M = zeros(Complex{T}, div(nzfft,2)+1, nhxfft, nhyfft)
     for ip = 1:npx*npy
-        ipx = div(ip-1, npy) + 1
-        ipy = mod(ip-1, npy) + 1
-        num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-        denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
         for ikz = 1:div(nzfft,2)+1
-            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, num, denom)
+            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, factor)
 
             is_out_of_bounds(ikhx_m1, ikhx_p1, ikhy_m1, ikhy_p1) && continue
 
@@ -462,12 +465,11 @@ function JopSlantStack3D_df′!(m::AbstractArray{T,5}, d::AbstractArray{T,5}; mo
 
     M = zeros(Complex{T}, div(nzfft,2)+1, ny, nx, nhxfft, nhyfft)
     for ip = 1:npx*npy
-        ipx = div(ip-1, npy) + 1
-        ipy = mod(ip-1, npy) + 1
-        num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-        denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
         for ikz = 1:div(nzfft,2)+1
-            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, num, denom)
+            ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy = compute_kh(ikz, ipx, ipy, px, py, kz, khx, khy, nhxfft, nhyfft, factor)
 
             is_out_of_bounds(ikhx_m1, ikhx_p1, ikhy_m1, ikhy_p1) && continue
 
@@ -491,10 +493,13 @@ function JopSlantStack3D_df′!(m::AbstractArray{T,5}, d::AbstractArray{T,5}; mo
     m .= TX * (brfft(TK * M, nzfft, (1,4,5))[1:nz,:,:,1:nhx,1:nhy])
 end
 
-@inline function slantstack_khxy_from_kz(ikz::Int64, ipx::Int64, ipy::Int64, px, py, kz, khx, khy, nhxfft, nhyfft, num, denom)
+@inline function slantstack_khxy_from_kz(ikz::Int64, ipx::Int64, ipy::Int64, px, py, kz, khx, khy, nhxfft, nhyfft, factor)
     _khx = kz[ikz]*px[ipx]*cos(py[ipy])
     _khy = kz[ikz]*px[ipx]*sin(py[ipy])
 
+    # check if wavenumber is outside Nyquist
+    ((_khx > khx[div(nhxfft+1,2)]) || (_khx < khx[div(nhxfft+1,2)+1]) || (_khy > khy[div(nhyfft+1,2)]) || (_khy < khy[div(nhyfft+1,2)+1])) && return -1, -1, _khx, -1, -1, _khy
+
     ikhx_m1 = floor(Int64, _khx/khx[2]) + 1
     ikhx_p1 = ceil(Int64, _khx/khx[2]) + 1
     ikhy_m1 = floor(Int64, _khy/khy[2]) + 1
@@ -508,9 +513,12 @@ end
     ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy
 end
 
-@inline function slantstack_khxy_from_kz_geologic(ikz::Int64, ipx::Int64, ipy::Int64, px, py, kz, khx, khy, nhxfft, nhyfft, num, denom)
-    _khx = kz[ikz]*px[ipx]*(cos(py[ipy]) + sin(py[ipy])*num)/denom
-    _khy = kz[ikz]*px[ipx]*(sin(py[ipy]) - cos(py[ipy])*num)/denom
+@inline function slantstack_khxy_from_kz_geologic(ikz::Int64, ipx::Int64, ipy::Int64, px, py, kz, khx, khy, nhxfft, nhyfft, factor)
+    _khx = kz[ikz]*px[ipx]*cos(py[ipy])*factor
+    _khy = kz[ikz]*px[ipx]*sin(py[ipy])*factor
+
+    # check if wavenumber is outside Nyquist
+    ((_khx > khx[div(nhxfft+1,2)]) || (_khx < khx[div(nhxfft+1,2)+1]) || (_khy > khy[div(nhyfft+1,2)]) || (_khy < khy[div(nhyfft+1,2)+1])) && return -1, -1, _khx, -1, -1, _khy
 
     ikhx_m1 = floor(Int64, _khx/khx[2]) + 1
     ikhx_p1 = ceil(Int64, _khx/khx[2]) + 1
@@ -525,10 +533,13 @@ end
     ikhx_m1, ikhx_p1, _khx, ikhy_m1, ikhy_p1, _khy
 end
 
-@inline function slantstack_khxy_from_frequency(iω::Int64, ipx::Int64, ipy::Int64, px, py, ω, khx, khy, nhxfft, nhyfft, num, denom)
+@inline function slantstack_khxy_from_frequency(iω::Int64, ipx::Int64, ipy::Int64, px, py, ω, khx, khy, nhxfft, nhyfft, factor)
     _khx = px[ipx]*ω[iω]
     _khy = py[ipy]*ω[iω]
 
+    # check if wavenumber is outside Nyquist
+    ((_khx > khx[div(nhxfft+1,2)]) || (_khx < khx[div(nhxfft+1,2)+1]) || (_khy > khy[div(nhyfft+1,2)]) || (_khy < khy[div(nhyfft+1,2)+1])) && return -1, -1, _khx, -1, -1, _khy
+
     ikhx_m1 = floor(Int64, _khx/khx[2]) + 1
     ikhx_p1 = ceil(Int64, _khx/khx[2]) + 1
     ikhy_m1 = floor(Int64, _khy/khy[2]) + 1
@@ -548,7 +559,7 @@ end
 where `A` is the 2D slant-stack operator mapping for `z-h` to `z-θ` (depth mode) or `t-h` to `tau-p` (time mode).
 The slant stacking is performed directly on the input domain by shifting and summing along the offset axis.
 The domain of the operator is `nz` x `nh` with precision T, `dz` is the depth spacing (or time interval),
-`h`, if provided, is the array of offsets (can be irregular). If not provided, a regular grid is assumed with same sampling as dz.
+`h`, if provided, is the array of (half) offsets (can be irregular). If not provided, a regular grid is assumed with same sampling as dz.
 The additional named optional arguments along with their default values are,
 
 * `mode="depth` - choose between "depth" and "time" to specify if the input domain is `z-h` or `t-h`
@@ -605,13 +616,13 @@ function JopSlantStackShiftSum_df!(d::AbstractArray{T,2}, m::AbstractArray{T,2};
 
     d .= 0
     mtap = TZ * m
+    holder = zeros(T, nz, Threads.maxthreadid())
     @threads for ip = 1:np
-        holder = zeros(T, nz)
         for ih = 1:nh
             shift = + h[ih] * p[ip] / dz
             if abs(shift) < nz
-                holder=_shift_forward(@view(mtap[:,ih]), shift)
-                d[:,ip] .+= holder
+                _shift_forward!(@view(holder[:,Threads.threadid()]), @view(mtap[:,ih]), shift)
+                @views d[:,ip] .+= holder[:,Threads.threadid()]
             end
         end
     end
@@ -622,13 +633,13 @@ function JopSlantStackShiftSum_df′!(m::AbstractArray{T,2}, d::AbstractArray{T,
     nz, nh, np = size(m,1), size(m,2), length(p)
 
     mtap = zeros(T, nz, nh)
+    holder = zeros(T, nz, Threads.maxthreadid())
     @threads for ih = 1:nh
-        holder = zeros(T, nz)
         for ip = 1:np
             shift = + h[ih] * p[ip] / dz
             if abs(shift) < nz
-                holder=_shift_adjoint(@view(d[:,ip]), shift)
-                @views mtap[:,ih] .+= holder
+                _shift_adjoint!(@view(holder[:,Threads.threadid()]), @view(d[:,ip]), shift)
+                @views mtap[:,ih] .+= holder[:,Threads.threadid()]
             end
         end
     end
@@ -645,7 +656,7 @@ end
 where `A` is the 3D slant-stack operator mapping for `z-hx-hy` to `z-θ-ϕ` (depth mode) or `t-hx-hy` to `tau-px-py` (time mode).
 The slant stacking is performed directly on the input domain by shifting and summing along the offset axis.
 The domain of the operator is `nz` x `nh` (for irregular) or `nz` x `nhx` x `nhy` (for regular) with precision T, `dz` is the depth spacing (or time interval),
-`hx` and `hy`, if provided, are the arrays of offsets (can be irregular). If not provided, a regular grid is assumed with same sampling as dz.
+`hx` and `hy`, if provided, are the arrays of (half) offsets (can be irregular). If not provided, a regular grid is assumed with same sampling as dz.
 The additional named optional arguments along with their default values are,
 
 * `mode="depth` - choose between "depth" and "time" to specify if the input domain is `z-hx-hy` or `t-hx-hy`
@@ -782,17 +793,16 @@ function JopSlantStackShiftSum3D_df_scalar!(d::AbstractArray{T,3}, m::AbstractAr
 
     d .= 0
     mtap = TZ * m
-    @threads for ipx = 1:npx
-        holder = zeros(T, nz)
-        for ipy = 1:npy
-            num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-            denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
-            for ih = 1:nh
-                shift = compute_shift(1, ih, ih, ipx, ipy, hx, hy, px, py, dz, num, denom)
-                if abs(shift) < nz
-                    holder=_shift_forward(@view(mtap[:,ih]), shift)
-                    d[:,ipx,ipy] .+= holder
-                end
+    holder = zeros(T, nz, Threads.maxthreadid())
+    @threads for ip = 1:npx*npy
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
+        for ih = 1:nh
+            shift = compute_shift(1, ih, ih, ipx, ipy, hx, hy, px, py, dz, factor)
+            if abs(shift) < nz
+                _shift_forward!(@view(holder[:,Threads.threadid()]), @view(mtap[:,ih]), shift)
+                @views d[:,ipx,ipy] .+= holder[:,Threads.threadid()]
             end
         end
     end
@@ -812,18 +822,17 @@ function JopSlantStackShiftSum3D_df_scalar!(d::AbstractArray{T,3}, m::AbstractAr
 
     d .= 0
     mtap = TZ * m
-    @threads for ipx = 1:npx
-        holder = zeros(T, nz)
-        for ipy = 1:npy
-            num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-            denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
-            for ihx = 1:nhx
-                for ihy = 1:nhy
-                    shift = compute_shift(1, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, num, denom)
-                    if abs(shift) < nz
-                        holder=_shift_forward(@view(mtap[:,ihx,ihy]), shift)
-                        d[:,ipx,ipy] .+= holder
-                    end
+    holder = zeros(T, nz, Threads.maxthreadid())
+    @threads for ip = 1:npx*npy
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
+        for ihx = 1:nhx
+            for ihy = 1:nhy
+                shift = compute_shift(1, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, factor)
+                if abs(shift) < nz
+                    _shift_forward!(@view(holder[:,Threads.threadid()]), @view(mtap[:,ihx,ihy]), shift)
+                    @views d[:,ipx,ipy] .+= holder[:,Threads.threadid()]
                 end
             end
         end
@@ -838,18 +847,17 @@ function JopSlantStackShiftSum3D_df_vector!(d::AbstractArray{T,3}, m::AbstractAr
     
     d .= 0
     mtap = TZ * m
-    @threads for ipx = 1:npx
-        holder = zeros(T, nz)
-        for ipy = 1:npy
-            num = tan.(dip).^2 .* sin.(azimuth .- py[ipy]) .* cos.(azimuth .- py[ipy])
-            denom = sqrt.(1 .+ (tan.(dip) .* sin.(azimuth .- py[ipy])).^2)
-            for ih = 1:nh
-                for iz = 1:nz
-                    shift = compute_shift(iz, ih, ih, ipx, ipy, hx, hy, px, py, dz, num, denom)
-                    if abs(shift) < nz 
-                        holder=_shift_forward(@view(mtap[:,ih]), shift)
-                        d[iz,ipx,ipy] += holder[iz]
-                    end
+    @threads for ip = 1:npx*npy
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt.( (1 .+ tan.(dip).^2) ./ (1 .+ tan.(dip).^2 .* cos.(azimuth .- py[ipy]).^2) )
+        for ih = 1:nh
+            for iz = 1:nz
+                shift = compute_shift(iz, ih, ih, ipx, ipy, hx, hy, px, py, dz, factor)
+                if abs(shift) < nz 
+                    # holder=_shift_forward(@view(mtap[:,ih]), shift)
+                    # d[iz,ipx,ipy] += holder[iz]
+                    d[iz,ipx,ipy] += _interpolate_forward(@view(mtap[:,ih]), iz - shift)
                 end
             end
         end
@@ -864,19 +872,16 @@ function JopSlantStackShiftSum3D_df_vector!(d::AbstractArray{T,3}, m::AbstractAr
     
     d .= 0
     mtap = TZ * m
-    @threads for ipx = 1:npx
-        holder = zeros(T, nz)
-        for ipy = 1:npy
-            num = tan.(dip).^2 .* sin.(azimuth .- py[ipy]) .* cos.(azimuth .- py[ipy])
-            denom = sqrt.(1 .+ (tan.(dip) .* sin.(azimuth .- py[ipy])).^2)
-            for ihx = 1:nhx
-                for ihy = 1:nhy
-                    for iz = 1:nz
-                        shift = compute_shift(iz, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, num, denom)
-                        if abs(shift) < nz 
-                            holder=_shift_forward(@view(mtap[:,ihx,ihy]), shift)
-                            d[iz,ipx,ipy] += holder[iz]
-                        end
+    @threads for ip = 1:npx*npy
+        ipy = div(ip-1, npx) + 1
+        ipx = mod(ip-1, npx) + 1
+        factor = sqrt.( (1 .+ tan.(dip).^2) ./ (1 .+ tan.(dip).^2 .* cos.(azimuth .- py[ipy]).^2) )
+        for ihx = 1:nhx
+            for ihy = 1:nhy
+                for iz = 1:nz
+                    shift = compute_shift(iz, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, factor)
+                    if abs(shift) < nz 
+                        d[iz,ipx,ipy] += _interpolate_forward(@view(mtap[:,ihx,ihy]), iz - shift)
                     end
                 end
             end
@@ -897,21 +902,21 @@ function JopSlantStackShiftSum3D_df_scalar′!(m::AbstractArray{T,2}, d::Abstrac
     end
 
     mtap = zeros(T, nz, nh)
+    acc = zeros(T, nz, Threads.maxthreadid())
+    holder = zeros(T, nz, Threads.maxthreadid())
     @threads for ih = 1:nh
-        acc = zeros(T, nz)
-        holder = zeros(T, nz)
+        fill!(@view(acc[:, Threads.threadid()]), zero(T))
         for ipx = 1:npx
             for ipy = 1:npy
-                num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-                denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
-                shift = compute_shift(1, ih, ih, ipx, ipy, hx, hy, px, py, dz, num, denom)
+                factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
+                shift = compute_shift(1, ih, ih, ipx, ipy, hx, hy, px, py, dz, factor)
                 if abs(shift) < nz
-                    holder=_shift_adjoint(@view(d[:,ipx,ipy]), shift)
-                    acc .+= holder
+                    _shift_adjoint!(@view(holder[:,Threads.threadid()]), @view(d[:,ipx,ipy]), shift)
+                    @views acc[:, Threads.threadid()] .+= holder[:,Threads.threadid()]
                 end
             end
         end
-        mtap[:,ih] .= acc
+        @views mtap[:,ih] .= acc[:, Threads.threadid()]
     end
     m = TZ' * mtap
     m
@@ -929,23 +934,23 @@ function JopSlantStackShiftSum3D_df_scalar′!(m::AbstractArray{T,3}, d::Abstrac
     end
 
     mtap = zeros(T, nz, nhx, nhy)
-    @threads for ihx = 1:nhx
-        for ihy = 1:nhy
-            acc = zeros(T, nz)
-            holder = zeros(T, nz)
-            for ipx = 1:npx
-                for ipy = 1:npy
-                    num = tan(dip)^2 *sin(azimuth - py[ipy])*cos(azimuth - py[ipy])
-                    denom = sqrt(1 + (tan(dip)*sin(azimuth - py[ipy]))^2)
-                    shift = compute_shift(1, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, num, denom)
-                    if abs(shift) < nz
-                        holder=_shift_adjoint(@view(d[:,ipx,ipy]), shift)
-                        acc .+= holder
-                    end
+    acc = zeros(T, nz, Threads.maxthreadid())
+    holder = zeros(T, nz, Threads.maxthreadid())
+    @threads for ih = 1:nhx*nhy
+        ihy = div(ih-1, nhx) + 1
+        ihx = mod(ih-1, nhx) + 1
+        fill!(@view(acc[:, Threads.threadid()]), zero(T))
+        for ipx = 1:npx
+            for ipy = 1:npy
+                factor = sqrt( (1 + tan(dip)^2) / (1 + tan(dip)^2 * cos(azimuth - py[ipy])^2) )
+                shift = compute_shift(1, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, factor)
+                if abs(shift) < nz
+                    _shift_adjoint!(@view(holder[:, Threads.threadid()]), @view(d[:,ipx,ipy]), shift)
+                    @views acc[:, Threads.threadid()] .+= holder[:, Threads.threadid()]
                 end
             end
-            mtap[:,ihx,ihy] .= acc
         end
+        @views mtap[:,ihx,ihy] .= acc[:, Threads.threadid()]
     end
     m = TZ' * mtap
     m
@@ -958,19 +963,13 @@ function JopSlantStackShiftSum3D_df_vector′!(m::AbstractArray{T,2}, d::Abstrac
 
     mtap = zeros(T, nz, nh)
     @threads for ih = 1:nh
-        acc = zeros(T, nz)
-        holder = zeros(T, nz)
         for ipx = 1:npx
             for ipy = 1:npy
-                num = tan.(dip).^2 .* sin.(azimuth .- py[ipy]) .* cos.(azimuth .- py[ipy])
-                denom = sqrt.(1 .+ (tan.(dip) .* sin.(azimuth .- py[ipy])).^2)
+                factor = sqrt.( (1 .+ tan.(dip).^2) ./ (1 .+ tan.(dip).^2 .* cos.(azimuth .- py[ipy]).^2) )
                 for iz = 1:nz
-                    shift = compute_shift(iz, ih, ih, ipx, ipy, hx, hy, px, py, dz, num, denom)
+                    shift = compute_shift(iz, ih, ih, ipx, ipy, hx, hy, px, py, dz, factor)
                     if abs(shift) < nz
-                        fill!(holder, zero(T))
-                        holder[iz] = d[iz,ipx,ipy]
-                        acc=_shift_adjoint(holder, shift)
-                        mtap[:,ih] .+= acc
+                        _interpolate_adjoint!(@view(mtap[:,ih]), d[iz,ipx,ipy], iz - shift)
                     end
                 end
             end
@@ -986,22 +985,16 @@ function JopSlantStackShiftSum3D_df_vector′!(m::AbstractArray{T,3}, d::Abstrac
     compute_shift = slantstack_shift_theta_phi_geologic
 
     mtap = zeros(T, nz, nhx, nhy)
-    @threads for ihx = 1:nhx
-        for ihy = 1:nhy
-            acc = zeros(T, nz)
-            holder = zeros(T, nz)
-            for ipx = 1:npx
-                for ipy = 1:npy
-                    num = tan.(dip).^2 .* sin.(azimuth .- py[ipy]) .* cos.(azimuth .- py[ipy])
-                    denom = sqrt.(1 .+ (tan.(dip) .* sin.(azimuth .- py[ipy])).^2)
-                    for iz = 1:nz
-                        shift = compute_shift(iz, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, num, denom)
-                        if abs(shift) < nz
-                            fill!(holder, zero(T))
-                            holder[iz] = d[iz,ipx,ipy]
-                            acc=_shift_adjoint(holder, shift)
-                            mtap[:,ihx,ihy] .+= acc
-                        end
+    @threads for ih = 1:nhx*nhy
+        ihy = div(ih-1, nhx) + 1
+        ihx = mod(ih-1, nhx) + 1
+        for ipx = 1:npx
+            for ipy = 1:npy
+                factor = sqrt.( (1 .+ tan.(dip).^2) ./ (1 .+ tan.(dip).^2 .* cos.(azimuth .- py[ipy]).^2) )
+                for iz = 1:nz
+                    shift = compute_shift(iz, ihx, ihy, ipx, ipy, hx, hy, px, py, dz, factor)
+                    if abs(shift) < nz
+                        _interpolate_adjoint!(@view(mtap[:,ihx,ihy]), d[iz,ipx,ipy], iz - shift)
                     end
                 end
             end
@@ -1011,50 +1004,80 @@ function JopSlantStackShiftSum3D_df_vector′!(m::AbstractArray{T,3}, d::Abstrac
     m
 end
 
-@inline function slantstack_shift_px_py(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, num::Real, denom::Real)
+@inline function slantstack_shift_px_py(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, factor::Real)
     # for time domain, the shift is given by (hx*px + hy*py) / dz
-    shift = + (hx[ihx] * px[ipx] + hy[ihy] * py[ipy]) / dz
-    shift
+    shift = + hx[ihx] * px[ipx] + hy[ihy] * py[ipy]
+    shift / dz
 end
 
-@inline function slantstack_shift_theta_phi(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, num::Real, denom::Real)
+@inline function slantstack_shift_theta_phi(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, factor::Real)
     # for depth domain ignoring geologic dip, the shift is given by (-hx*tan(θ)*cos(ϕ) - hy*tan(θ)*sin(ϕ)) / dz
-    shift = + (hx[ihx] * px[ipx] * cos(py[ipy]) + hy[ihy] * px[ipx] * sin(py[ipy])) / dz
-    shift
+    shift = + hx[ihx] * px[ipx] * cos(py[ipy]) + hy[ihy] * px[ipx] * sin(py[ipy])
+    shift / dz
 end
 
-@inline function slantstack_shift_theta_phi_geologic(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, num::Real, denom::Real)
-    # for depth domain accounting for geologic dip, the shift is given by (-hx*tan(θ)*(cos(ϕ)/denom + sin(ϕ)*num/denom) - hy*tan(θ)*(sin(ϕ)/denom - cos(ϕ)*num/denom)) / dz
-    shift = + hx[ihx] * px[ipx] * (cos(py[ipy]) / denom + sin(py[ipy]) * num / denom) + hy[ihy] * px[ipx] * (sin(py[ipy]) / denom - cos(py[ipy]) * num / denom)
-    shift / dz
+@inline function slantstack_shift_theta_phi_geologic(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, factor::Real)
+    # for depth domain accounting for geologic dip, the shift is given by (-hx*tan(θ)*cos(ϕ)*factor - hy*tan(θ)*sin(ϕ)*factor) / dz
+    shift = + hx[ihx] * px[ipx] * cos(py[ipy]) + hy[ihy] * px[ipx] * sin(py[ipy])
+    shift * factor / dz
 end
 
-@inline function slantstack_shift_theta_phi_geologic(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, num::AbstractArray{T,1}, denom::AbstractArray{T,1}) where {T}
-    # for depth domain accounting for geologic dip, the shift is given by (-hx*tan(θ)*(cos(ϕ)/denom + sin(ϕ)*num/denom) - hy*tan(θ)*(sin(ϕ)/denom - cos(ϕ)*num/denom)) / dz
-    shift = + hx[ihx] * px[ipx] * (cos(py[ipy]) / denom[iz] + sin(py[ipy]) * num[iz] / denom[iz]) + hy[ihy] * px[ipx] * (sin(py[ipy]) / denom[iz] - cos(py[ipy]) * num[iz] / denom[iz])
-    shift / dz
+@inline function slantstack_shift_theta_phi_geologic(iz::Int64, ihx::Int64, ihy::Int64, ipx::Int64, ipy::Int64, hx, hy, px, py, dz, factor::AbstractArray{T,1}) where {T}
+    # for depth domain accounting for geologic dip, the shift is given by (-hx*tan(θ)*cos(ϕ)*factor - hy*tan(θ)*sin(ϕ)*factor) / dz
+    shift = + hx[ihx] * px[ipx] * cos(py[ipy]) + hy[ihy] * px[ipx] * sin(py[ipy])
+    shift * factor[iz] / dz
+end
+
+# helper functions to build a compact sinc kernel for forward and adjoint shifting/interpolation in 1D
+
+function _shift_zeropad(x::AbstractVector{T}, s::Int) where {T}
+    y = zeros(T, length(x))
+    n = length(x)
+    src_lo = max(1, 1 - s)
+    src_hi = min(n, n - s)
+    if src_lo <= src_hi
+        dst_lo = src_lo + s
+        dst_hi = src_hi + s
+        @views y[dst_lo:dst_hi] .= x[src_lo:src_hi]
+    end
+    return y
+end
+
+
+@inline function _hann_local(L::Int, T::Type=Float32)
+    L <= 1 && return ones(T, L)
+    n = collect(0:(L-1))
+    return T(0.5) .- T(0.5) .* cos.(T(2π) .* (n ./ (L-1)))
 end
 
-# helper functions to build a compact sinc kernel for forward and adjoint shifting in 1D
-function _sinc_kernel(δ, N)
+function _sinc_kernel(δ::Real, N::Int)
     n = -(N÷2):(N÷2)
     h = sinc.(n .- δ)
-    h .* hann(length(h))
+    w = _hann_local(length(h), eltype(h))
+    h .* w
 end
 
-function _shift_forward(x, shift, N = 7)
+function _shift_forward!(y::AbstractArray{T,1}, x::AbstractArray{T,1}, shift::Real, N::Int = 7) where {T}
     ishift = round(Int, shift)
     frac = shift - ishift
+    if frac == 0.0
+        y .= _shift_zeropad(x, ishift)
+        return nothing
+    end
     h = _sinc_kernel(frac, N)
     L = length(h) ÷ 2
-    x_int = circshift(x, ishift)
+    x_int = _shift_zeropad(x, ishift)
     x_frac = conv(x_int, h)
-    x_frac[L+1 : L+length(x)]
+    y .= x_frac[L+1 : L+length(x)]
 end
 
-function _shift_adjoint(x, shift, N = 7)
+function _shift_adjoint!(y::AbstractArray{T,1}, x::AbstractArray{T,1}, shift::Real, N::Int = 7) where {T}
     ishift = round(Int, shift)
     frac = shift - ishift
+    if frac == 0.0
+        y .= _shift_zeropad(x, -ishift)
+        return nothing
+    end
     h = reverse(_sinc_kernel(frac, N))
     n = length(x)
     m = length(h)
@@ -1062,5 +1085,48 @@ function _shift_adjoint(x, shift, N = 7)
     x_pad = zeros(eltype(x), n+m-1)
     x_pad[L+1 : L+n] .= x
     x_frac = conv(x_pad, h)
-    circshift(@view(x_frac[m : m+n-1]), -ishift)
+    y .= _shift_zeropad(@view(x_frac[m : m+n-1]), -ishift)
+end
+
+function _interpolate_forward(x::AbstractArray{T,1}, loc::Real, N::Int = 7) where {T}
+    iloc = round(Int, loc)
+    frac = loc - iloc
+    if frac == 0.0 && 1 <= iloc <= length(x)
+        return x[iloc]
+    end
+    
+    h = _sinc_kernel(frac, N)
+    L = length(h) ÷ 2
+    
+    # Apply sinc interpolation at fractional location using direct sum
+    result = zero(T)
+    n = -(N÷2)
+    for weight in h
+        idx = iloc + n
+        if 1 <= idx <= length(x)
+            result += x[idx] * weight
+        end
+        n += 1
+    end
+    result
+end
+
+function _interpolate_adjoint!(x::AbstractArray{T,1}, val::T, loc::Real, N::Int = 7) where {T}
+    iloc = round(Int, loc)
+    frac = loc - iloc
+    
+    if frac == 0.0 && 1 <= iloc <= length(x)
+        x[iloc] += val
+        return nothing
+    end
+    
+    h = _sinc_kernel(frac, N)
+    n = -(N÷2)
+    for weight in h
+        idx = iloc + n
+        if 1 <= idx <= length(x)
+            x[idx] += val * weight
+        end
+        n += 1
+    end
 end
\ No newline at end of file
diff --git a/test/jop_slantstack.jl b/test/jop_slantstack.jl
index 09327b8..bac0d29 100644
--- a/test/jop_slantstack.jl
+++ b/test/jop_slantstack.jl
@@ -22,6 +22,36 @@ end
     @test i[2] == findfirst(x->x≈0, state(A).tant)
 end
 
+@testset "JetSlantStack, correctness advanced" begin
+    h0 = -100
+    dh = 1
+    dz = 1
+    nz = 200
+    h = collect(h0:dh:abs(h0))
+    nh = length(h)
+    og = zeros(Float32, nz, nh)
+    
+    # create a dipping event
+    dip = 20
+    iz0 = div(nz,2)
+    for ih = 1:nh
+        hx = h[ih]
+        z = tan(deg2rad(dip)) * hx + (iz0-1)*dz
+        iz = round(Int, div(z, dz)) + 1
+        og[iz,ih] = 1
+    end
+
+    θ = collect(-30:1:30)
+    
+    A = JopSlantStack(JetSpace(Float32, nz, nh); mode = "depth", theta = θ, dz = dz, dh = dh, h0 = h0)
+    ag = A * og
+
+    argmax_ag = Tuple(argmax(ag))
+    idip = findfirst(==(dip), θ)
+    @show argmax_ag, (iz0, idip)
+    @test argmax_ag == (iz0, idip)
+end
+
 # time mode
 @testset "JetSlantStack, dot product test" for T in (Float32, Float64)
     A = JopSlantStack(JetSpace(T, 64, 128); dz=0.004, dh=10.0, h0=-1000.0, mode="time")
@@ -34,7 +64,7 @@ end
     @test isapprox(lhs,rhs,rtol=1e-4)
 end
 
-@testset "JetSlantStack, correctness" for padz in (0.0, 0.2)
+@testset "JetSlantStack, correctness (time)" for padz in (0.0, 0.2)
     A = JopSlantStack(JetSpace(Float64, 64, 128); dz=0.004, dh=10.0, h0=-1000.0, padz, mode="time")
     m = zeros(domain(A))
     m[32,:] .= 1
@@ -45,7 +75,7 @@ end
 
 # depth-time parity test
 @testset "JetSlantStack, parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     # the ray parameters that would give the same results as theta is
     p = @. -tan(deg2rad(theta))
     
@@ -72,7 +102,7 @@ end
 end
 
 @testset "JetSlantStackShiftSum, parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     # the ray parameters that would give the same results as theta is
     p = @. -tan(deg2rad(theta))
     
@@ -94,7 +124,7 @@ end
 end
 
 @testset "JetSlantStack vs JetSlantStackShiftSum, parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     A1 = JopSlantStack(JetSpace(Float64, 128, 129); mode="depth", theta=theta, dz=0.01, h0=-0.64, dh=0.01)
     A2 = JopSlantStackShiftSum(JetSpace(Float64, 128, 129); mode="depth" , theta=theta, dz=0.01, h=collect(-0.64:0.01:0.64))
     m = zeros(domain(A1))
@@ -113,7 +143,7 @@ end
     @test similarity > 0.95
 end
 
-@testset "JetSlantStack3D, dot product test, T = $(T), mode = $(mode), dip = $(dip)" for T in (Float32, Float64), mode in ("depth", "time"), (dip, azimuth) in ((0.0,0.0), (30.0, 60.0))
+@testset "JetSlantStack3D, dot product test, T = $(T), mode = $(mode), dip = $(dip)" for T in (Float32, Float64), mode in ("depth", "time"), (dip, azimuth) in ((0.0,0.0), (20.0, 30.0))
     Random.seed!(1234)
     A = JopSlantStack3D(JetSpace(T, 32, 8, 5); dz=10.0, dhx=10.0, dhy=5.0, hx0=-1000.0, hy0=-500.0, mode=mode, dip=dip, azimuth=azimuth, taperz=(0.3,0.3), taperhx=(0.3,0.3), taperhy=(0.3,0.3), taperkz=(0.3,0.3), taperkhx=(0.3,0.3), taperkhy=(0.3,0.3))
 
@@ -126,8 +156,47 @@ end
     @test isapprox(lhs,rhs,rtol=1e-4)
 end
 
+@testset "JetSlantStack3D correctness" begin
+    hx0 = -100
+    hy0 = -100
+    dhx = 1
+    dhy = 1
+    dz = 1
+    nz = 200
+    nx = 1
+    ny = 1
+    hx = collect(hx0:dhx:abs(hx0))
+    hy = collect(hy0:dhy:abs(hy0))
+    nhx = length(hx)
+    nhy = length(hy)
+    og = zeros(Float32, nz, ny, nx, nhx, nhy)
+    
+    # create a dipping event
+    dip = 20
+    azimuth = 45
+    iz0 = div(nz,2)
+    for ihx = 1:nhx
+        for ihy = 1:nhy
+            z = tan(deg2rad(dip)) * (cos(deg2rad(azimuth)) * hx[ihx] + sin(deg2rad(azimuth)) * hy[ihy]) + (iz0-1)*dz
+            iz = round(Int, div(z, dz)) + 1
+            og[iz,1,1,ihx,ihy] = 1
+        end
+    end
+
+    θ = collect(-30:1:30)
+    ϕ = collect(0:45:135)
+    A = JopSlantStack3D(JetSpace(Float32, nz, ny, nx, nhx, nhy); mode = "depth", theta = θ, phi = ϕ, dz = dz, dhx = dhx, dhy = dhy, hx0 = hx0, hy0 = hy0)
+    ag = A * og
+
+    argmax_ag = Tuple(argmax(ag))
+    idip = findfirst(==(dip), θ)
+    iaz = findfirst(==(azimuth), ϕ)
+    @show argmax_ag, (iz0, 1, 1, idip, iaz)
+    @test argmax_ag == (iz0, 1, 1, idip, iaz)
+end
+
 @testset "JetSlantStack3D, time-depth parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     phi = [0.0, 90.0]
     
     # the ray parameters that would give the same results as theta/phi are
@@ -157,7 +226,7 @@ end
 end
 
 @testset "JetSlantStack3D, 3D vs 5D parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     phi = [0.0]
     px = @. -tan(deg2rad(theta))
     py = [0.0]
@@ -183,7 +252,7 @@ end
 end
 
 @testset "JetSlantStack3D, dip invariance" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     phi = [0.0]
     
     A1 = JopSlantStack3D(JetSpace(Float64, 128, 129, 5); mode="depth", theta=theta, phi=phi, dz=0.01, hx0=-0.64, dhx=0.01, hy0=0, dhy=0.01, dip=0.0, azimuth=0.0)
@@ -219,7 +288,7 @@ end
 end
 
 @testset "JetSlantStackShiftSum3D, time-depth parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     phi = [0.0, 90.0]
     
     # the ray parameters that would give the same results as theta/phi are
@@ -248,14 +317,14 @@ end
 end
 
 @testset "JetSlantStackShiftSum3D, dip parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-10:1.0:15)
     phi = collect(0.0:45.0:135.0)
     
     A1 = JopSlantStackShiftSum3D(JetSpace(Float64, 64, 129); mode="depth", theta=theta, phi = phi, dz=0.01, hx=collect(-0.64:0.01:0.64), hy=collect(-0.64:0.01:0.64), dip = 0.0, azimuth = 0.0)
     A2 = JopSlantStackShiftSum3D(JetSpace(Float64, 64, 129); mode="depth", theta=theta, phi = phi, dz=0.01, hx=collect(-0.64:0.01:0.64), hy=collect(-0.64:0.01:0.64), dip = zeros(64), azimuth = zeros(64))
     m = zeros(domain(A1))
     for i = 1:129
-        m[div(i,4)+20,i] = 1.0
+        m[26+div(i,10),i] = 1.0
     end
     m[16,64] = 1
     m[32,96] = 1
@@ -280,7 +349,7 @@ end
 end
 
 @testset "JetSlantStackShiftSum3D, dip invariance" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-10:1.0:15)
     phi = [0.0]
     
     A1 = JopSlantStackShiftSum3D(JetSpace(Float64, 64, 129); mode="depth", theta=theta, phi = phi, dz=0.01, hx=collect(-0.64:0.01:0.64), hy=collect(-0.64:0.01:0.64), dip = 0.0, azimuth = 0.0)
@@ -288,7 +357,7 @@ end
 
     m = zeros(domain(A1))
     for i = 1:129
-        m[div(i,4)+20,i] = 1.0
+        m[26+div(i,10),i] = 1.0
     end
     m[16,64] = 1
     m[32,96] = 1
@@ -302,7 +371,7 @@ end
 end
 
 @testset "JetSlantStackShiftSum3D, offsets parity" begin
-    theta = collect(-45:5.0:45)
+    theta = collect(-35:5.0:45)
     phi = collect(0.0:45.0:135.0)
 
     hx_reg = collect(-0.6:0.1:0.6)
@@ -332,8 +401,8 @@ end
     @test err_d < 1e-7
 end
 
-@testset "JetSlantStack3D vs JetSlantStackShiftSum3D, parity" for dip in (0.0, 10.0)
-    theta = collect(-45:1.0:45)
+@testset "JetSlantStack3D vs JetSlantStackShiftSum3D, parity" for dip in (0.0, 60.0)
+    theta = collect(-45:1.0:65)
     phi = collect(0.0:45.0:135.0)
     azimuth = 45.0
     A1 = JopSlantStack3D(JetSpace(Float64, 128, 129, 5); mode="depth", theta=theta, phi=phi, dz=0.01, hx0=-0.64, dhx=0.01, hy0=0, dhy=0.01, dip=dip, azimuth=azimuth)
@@ -341,10 +410,12 @@ end
     m = zeros(domain(A1))
     for i = 1:129
         m[div(i,4)+20,i,1] = 1.0
-        m[div(i,3)+10,i,2] = -0.5
+        m[div(i,3)+10,i,2] = 1.0
     end
     m[16,64,1] = 1
-    m[64,96,5] = -0.2
+    m[64,96,1] = 1
+
+    # @code_warntype d2 = A2*m
     d1 = A1*m
     d2 = A2*m
 
@@ -356,7 +427,7 @@ end
 end
 
 @testset "JopSlantStackShiftSum vs JopSlantStackShiftSum3D, parity" begin
-    theta = collect(-45:1.0:45)
+    theta = collect(-35:1.0:45)
     phi = [0.0]
     A1 = JopSlantStackShiftSum(JetSpace(Float64, 128, 129); mode="depth" , theta=theta, dz=0.01, h=collect(-0.64:0.01:0.64))
     A2 = JopSlantStackShiftSum3D(JetSpace(Float64, 128, 129, 5); mode="depth" , theta=theta, phi=phi, dz=0.01, hx=collect(-0.64:0.01:0.64), hy=collect(0:0.01:0.04))