-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_host_device.py
More file actions
50 lines (39 loc) · 1.6 KB
/
benchmark_host_device.py
File metadata and controls
50 lines (39 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torch
import gdel3d
import time
# gdel3d has two bindings:
# - triangulate_host: uses the original CPU code and expects CPU tensors, so the
#   binding must first copy a GPU tensor to the CPU (and copy results back).
# - triangulate_device: leaves the data on the GPU, avoiding repeated transfers
#   back and forth to the CPU.
def benchmark():
    """Benchmark gdel3d's host-side vs. device-side triangulation bindings.

    Generates one random GPU point cloud, times ``triangulate_host`` (which
    round-trips the data through the CPU) against ``triangulate_device``
    (which keeps the data on the GPU), and prints both timings, the speedup,
    and the difference in output tetrahedron counts.
    """
    N = 700_000
    points_cuda = torch.rand((N, 3), dtype=torch.float32, device='cuda')
    torch.cuda.synchronize()

    # Warm up the host binding on a small slice so one-time kernel-launch/JIT
    # costs are excluded from the measurement.
    # NOTE(review): the original warmed up with gdel3d.triangulate while timing
    # triangulate_host; warming up the function actually being timed keeps the
    # two measurement paths consistent.
    _ = gdel3d.triangulate_host(points_cuda[:1000])
    # Ensure the warmup has fully finished before recording the start event
    # (the device path below already did this; the host path did not).
    torch.cuda.synchronize()

    # CUDA events give stream-ordered GPU timing in milliseconds.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Measure triangulate_host given GPU tensors (includes host<->device copies).
    start_event.record()
    tets_final_host = gdel3d.triangulate_host(points_cuda)
    end_event.record()
    torch.cuda.synchronize()
    total_host = start_event.elapsed_time(end_event)

    # Warm up and then measure the all-on-GPU binding.
    _ = gdel3d.triangulate_device(points_cuda[:1000])
    torch.cuda.synchronize()
    start_event.record()
    tets_final_device = gdel3d.triangulate_device(points_cuda)
    end_event.record()
    torch.cuda.synchronize()
    total_device = start_event.elapsed_time(end_event)

    speedup = total_host / total_device
    print(f"Host Total time: {total_host:.2f} ms", flush=True)
    print(f"Device Total time: {total_device:.2f} ms", flush=True)
    print(f"Speedup: {speedup:.2f}x", flush=True)

    # Counts may differ slightly between the two paths because the GPU
    # algorithm is non-deterministic; report the absolute difference only.
    diff = abs(tets_final_host.shape[0] - tets_final_device.shape[0])
    print(f"\nTetrahedron count difference: {diff} (Due to GPU non-determinism)", flush=True)


if __name__ == "__main__":
    benchmark()