-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbinding.cu
More file actions
84 lines (63 loc) · 3.01 KB
/
binding.cu
File metadata and controls
84 lines (63 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include <torch/extension.h>

#include <cstring>
#include <iostream>
#include <limits>
#include <vector>

#include "GDelFlipping/src/gDel3D/GpuDelaunay.h"
#include "GDelFlipping/src/gDel3D/CommonTypes.h"
// Takes in points in either CPU/GPU, copies them to CPU as that's what compute expects
// and returns in the device it came from.
//
// points: (N, 3) tensor on any device; converted to contiguous float64 on CPU.
// Returns: (T, 4) int32 tensor of tetrahedron vertex indices, moved back to
//          points' original device.
torch::Tensor triangulate_host(torch::Tensor points) {
    TORCH_CHECK(points.dim() == 2 && points.size(1) == 3, "Points shape must be (N, 3)");
    // The raw memcpys below assume Point3 is 3 tightly packed doubles and Tet is
    // 4 tightly packed ints. If the library's layout ever differs, the second
    // memcpy would overflow the (T, 4) tensor — fail at compile time instead.
    static_assert(sizeof(Point3) == 3 * sizeof(double), "Point3 must be 3 packed doubles");
    static_assert(sizeof(Tet) == 4 * sizeof(int), "Tet must be 4 packed ints");
    // Default impl expects points to be in host
    torch::Tensor points_cpu = points.to(torch::kCPU).to(torch::kFloat64).contiguous();
    // size(0) is int64_t; gDel3D works with int-sized counts, so narrow explicitly.
    const int64_t n = points_cpu.size(0);
    TORCH_CHECK(n <= std::numeric_limits<int>::max(), "Too many points for gDel3D (int index range)");
    const int num_points = static_cast<int>(n);
    double* data_ptr = points_cpu.data_ptr<double>();
    std::vector<Point3> pointVec(num_points);
    std::memcpy(pointVec.data(), data_ptr, num_points * sizeof(Point3));
    GDelParams params;
    params.noSorting = false;  // keep the library's point pre-sorting enabled
    params.noSplaying = true;  // presumably disables the splaying phase — TODO confirm against gDel3D docs
    GpuDel gpuDel(params);
    GDelOutput output;
    gpuDel.compute(pointVec, &output);
    const int num_tets = static_cast<int>(output.tetVec.size());
    auto options = torch::TensorOptions().dtype(torch::kInt32);
    torch::Tensor tets = torch::empty({num_tets, 4}, options);
    std::memcpy(tets.data_ptr<int>(), output.tetVec.data(), num_tets * sizeof(Tet));
    // Hand the result back on whichever device the caller gave us the points on.
    return tets.to(points.device());
}
// expects points to be in GPU, skips CPU transfers for better speed
//
// points: (N, 3) CUDA tensor, float32 or float64 (float32 is converted).
// Returns: (T, 4) int32 CUDA tensor of tetrahedron vertex indices, where T is
//          the count reported back by computeDevice.
torch::Tensor triangulate_device(torch::Tensor points) {
    TORCH_CHECK(points.device().is_cuda(), "Points must be a CUDA tensor");
    TORCH_CHECK(points.dim() == 2 && points.size(1) == 3, "Points must be (N, 3)");
    // Float64 is not that much slower and returns less failed faces
    torch::Tensor points_double;
    if (points.dtype() == torch::kFloat32) {
        points_double = points.to(torch::kFloat64);
    } else {
        TORCH_CHECK(points.dtype() == torch::kFloat64, "Points must be float32 or float64");
        points_double = points;
    }
    // Ensure contiguous as our computeDevice requires it
    points_double = points_double.contiguous();
    // size(0) is int64_t; computeDevice takes int counts. Guard the narrowing
    // AND the *10 capacity heuristic below — num_points * 10 in plain int is
    // signed overflow (UB) for large N.
    const int64_t n = points_double.size(0);
    TORCH_CHECK(n <= std::numeric_limits<int>::max() / 10,
                "Too many points for device pipeline (int index range)");
    const int num_points = static_cast<int>(n);
    const double* d_points = points_double.data_ptr<double>();
    // Make extra sure there's enough allocation for output
    // Allocating space here and not inside routine so VRAM is automatically freed
    const int max_tets = num_points * 10;  // heuristic upper bound — TODO confirm it always suffices
    auto options = torch::TensorOptions().dtype(torch::kInt32).device(points.device());
    torch::Tensor tets = torch::empty({max_tets, 4}, options);
    int* d_tets = tets.data_ptr<int>();
    GDelParams params;
    params.noSorting = false;  // keep the library's point pre-sorting enabled
    params.noSplaying = true;  // presumably disables the splaying phase — TODO confirm against gDel3D docs
    GpuDel gpuDel(params);
    int num_actual_tets = 0;
    gpuDel.computeDevice(d_points, num_points, d_tets, num_actual_tets);
    // Trim the over-allocated buffer down to the tets actually produced (view, no copy).
    return tets.slice(0, 0, num_actual_tets);
}
// Python bindings. Exposes both pipelines plus a default `triangulate` alias
// that points at the device (GPU) pipeline.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("triangulate_host", &triangulate_host, "GpuDelaunay (CPU original Pipeline)");
m.def("triangulate_device", &triangulate_device, "GpuDelaunay (GPU Pipeline)");
// Same function object as triangulate_device, registered under a second name.
m.def("triangulate", &triangulate_device, "Default Triangulation (GPU Pipeline)");
}