-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbinding.cu
More file actions
84 lines (63 loc) · 3.01 KB
/
binding.cu
File metadata and controls
84 lines (63 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include <torch/extension.h>

#include <cstring>
#include <iostream>
#include <limits>
#include <vector>

#include "GDelFlipping/src/gDel3D/GpuDelaunay.h"
#include "GDelFlipping/src/gDel3D/CommonTypes.h"
// Takes in points in either CPU/GPU, copies them to CPU as that's what compute expects
// and returns in the device it came from.
//
// points: (N, 3) tensor on any device; converted to contiguous float64 on CPU.
// Returns: (T, 4) int32 tensor of tetrahedron vertex indices, moved back to
//          points' original device.
torch::Tensor triangulate_host(torch::Tensor points) {
    TORCH_CHECK(points.dim() == 2 && points.size(1) == 3, "Points shape must be (N, 3)");
    // The raw memcpys below assume Point3 is 3 tightly packed doubles and Tet is
    // 4 tightly packed ints. If the library's layout ever differs, the second
    // memcpy would overflow the (T, 4) tensor — fail at compile time instead.
    static_assert(sizeof(Point3) == 3 * sizeof(double), "Point3 must be 3 packed doubles");
    static_assert(sizeof(Tet) == 4 * sizeof(int), "Tet must be 4 packed ints");
    // Default impl expects points to be in host
    torch::Tensor points_cpu = points.to(torch::kCPU).to(torch::kFloat64).contiguous();
    // size(0) is int64_t; gDel3D works with int-sized counts, so narrow explicitly.
    const int64_t n = points_cpu.size(0);
    TORCH_CHECK(n <= std::numeric_limits<int>::max(), "Too many points for gDel3D (int index range)");
    const int num_points = static_cast<int>(n);
    double* data_ptr = points_cpu.data_ptr<double>();
    std::vector<Point3> pointVec(num_points);
    std::memcpy(pointVec.data(), data_ptr, num_points * sizeof(Point3));
    GDelParams params;
    params.noSorting = false;  // keep the library's point pre-sorting enabled
    params.noSplaying = true;  // presumably disables the splaying phase — TODO confirm against gDel3D docs
    GpuDel gpuDel(params);
    GDelOutput output;
    gpuDel.compute(pointVec, &output);
    const int num_tets = static_cast<int>(output.tetVec.size());
    auto options = torch::TensorOptions().dtype(torch::kInt32);
    torch::Tensor tets = torch::empty({num_tets, 4}, options);
    std::memcpy(tets.data_ptr<int>(), output.tetVec.data(), num_tets * sizeof(Tet));
    // Hand the result back on whichever device the caller gave us the points on.
    return tets.to(points.device());
}
// expects points to be in GPU, skips CPU transfers for better speed
//
// points: (N, 3) CUDA tensor, float32 or float64 (float32 is converted).
// Returns: (T, 4) int32 CUDA tensor of tetrahedron vertex indices, where T is
//          the count reported back by computeDevice.
torch::Tensor triangulate_device(torch::Tensor points) {
    TORCH_CHECK(points.device().is_cuda(), "Points must be a CUDA tensor");
    TORCH_CHECK(points.dim() == 2 && points.size(1) == 3, "Points must be (N, 3)");
    // Float64 is not that much slower and returns less failed faces
    torch::Tensor points_double;
    if (points.dtype() == torch::kFloat32) {
        points_double = points.to(torch::kFloat64);
    } else {
        TORCH_CHECK(points.dtype() == torch::kFloat64, "Points must be float32 or float64");
        points_double = points;
    }
    // Ensure contiguous as our computeDevice requires it
    points_double = points_double.contiguous();
    // size(0) is int64_t; computeDevice takes int counts. Guard the narrowing
    // AND the *10 capacity heuristic below — num_points * 10 in plain int is
    // signed overflow (UB) for large N.
    const int64_t n = points_double.size(0);
    TORCH_CHECK(n <= std::numeric_limits<int>::max() / 10,
                "Too many points for device pipeline (int index range)");
    const int num_points = static_cast<int>(n);
    const double* d_points = points_double.data_ptr<double>();
    // Make extra sure there's enough allocation for output
    // Allocating space here and not inside routine so VRAM is automatically freed
    const int max_tets = num_points * 10;  // heuristic upper bound — TODO confirm it always suffices
    auto options = torch::TensorOptions().dtype(torch::kInt32).device(points.device());
    torch::Tensor tets = torch::empty({max_tets, 4}, options);
    int* d_tets = tets.data_ptr<int>();
    GDelParams params;
    params.noSorting = false;  // keep the library's point pre-sorting enabled
    params.noSplaying = true;  // presumably disables the splaying phase — TODO confirm against gDel3D docs
    GpuDel gpuDel(params);
    int num_actual_tets = 0;
    gpuDel.computeDevice(d_points, num_points, d_tets, num_actual_tets);
    // Trim the over-allocated buffer down to the tets actually produced (view, no copy).
    return tets.slice(0, 0, num_actual_tets);
}
// Python bindings. Exposes both pipelines plus a default `triangulate` alias
// that points at the device (GPU) pipeline.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("triangulate_host", &triangulate_host, "GpuDelaunay (CPU original Pipeline)");
m.def("triangulate_device", &triangulate_device, "GpuDelaunay (GPU Pipeline)");
// Same function object as triangulate_device, registered under a second name.
m.def("triangulate", &triangulate_device, "Default Triangulation (GPU Pipeline)");
}