-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathservoing.py
More file actions
90 lines (71 loc) · 3.59 KB
/
Copy pathservoing.py
File metadata and controls
90 lines (71 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
servoing.py — pixel-space to robot-space math for visual servoing.
All functions work with the coordinate conventions of FrankaCubePushEnv:
- Camera looks along its -Z axis (points in front have p_cam[2] < 0)
- world_to_pixel: u = -f*p_cam[0]/p_cam[2] + W/2
v = H - 1 - (f*p_cam[1]/p_cam[2] + H/2)
"""
import numpy as np
def pixel_error_to_robot_delta(e_u, e_v, cam_pos, cam_R, eef_pos, f, Kp=1.0):
"""
Convert 2D pixel error (target - current) to world-frame XY delta.
Builds the full 2x2 image Jacobian J = d(u,v)/d(world_x, world_y) and
solves J @ [dx, dy] = [e_u, e_v]. The naive depth-only formula fails for
tilted cameras because world-Y movement also changes the camera depth
p_cam[2], so both projection terms contribute.
Derivation (camera looks along -Z; pixel u, v are in screen convention,
where v_screen = H-1-v_gl with v_gl = -f*p_cam[1]/depth + H/2):
du = -f*dp[0]/depth + f*p[0]*dp[2]/depth^2 # u == u_gl
dv_s = f*dp[1]/depth - f*p[1]*dp[2]/depth^2 # screen v
where dp = cam_R.T @ dP_world, dP_world = [dx, dy, 0].
Args:
e_u, e_v: pixel error = target_pixel - current_eef_pixel
cam_pos: (3,) camera position in world frame
cam_R: (3,3) camera-to-world rotation (columns: right, up, -forward)
eef_pos: (3,) current EEF world position
f: focal length in pixels
Kp: proportional gain (reduce if oscillating)
Returns:
(3,) world-frame delta [dx, dy, 0]
"""
p_cam = cam_R.T @ (eef_pos - cam_pos)
depth = p_cam[2] # negative for points in front of camera
# Build 2x2 image Jacobian: J[r, i] = d(pixel_r) / d(world_i), i in {x, y}
R_inv = cam_R.T # world-to-camera rotation
J = np.zeros((2, 2))
for i in range(2): # i=0 -> world X, i=1 -> world Y
J[0, i] = (-f * R_inv[0, i] / depth
+ f * p_cam[0] * R_inv[2, i] / depth**2)
# Screen-v gradient has opposite sign from v_gl gradient.
J[1, i] = ( f * R_inv[1, i] / depth
- f * p_cam[1] * R_inv[2, i] / depth**2)
try:
delta_xy = np.linalg.solve(J, np.array([float(e_u), float(e_v)]))
except np.linalg.LinAlgError:
return np.zeros(3)
return Kp * np.array([delta_xy[0], delta_xy[1], 0.0])
def pixel_to_world_at_z(u, v, z_world, cam_pos, cam_R, f, W, H):
"""
Unproject screen pixel (u, v) to the 3D world point at height z_world.
Exact inverse of FrankaCubePushEnv.world_to_pixel().
Args:
u, v: integer pixel coordinates (top-left origin, v increases down)
z_world: target world Z plane to intersect
cam_pos: (3,) camera position in world frame
cam_R: (3,3) camera-to-world rotation
f: focal length in pixels
W, H: image width and height in pixels
Returns:
(3,) world point on the plane z = z_world whose projection is (u, v)
"""
v_gl = H - 1 - v # flip to OpenGL convention (origin at bottom-left)
# Ray direction in camera frame (unnormalised; depth is the free parameter).
# Inverts world_to_pixel: u_gl = -f*p[0]/d + W/2, v_gl = -f*p[1]/d + H/2.
ax = -(u - W / 2.0) / f # p_cam[0] = ax * depth
ay = -(v_gl - H / 2.0) / f # p_cam[1] = ay * depth
# Solve for depth so that the world Z coordinate equals z_world:
# cam_pos[2] + (cam_R @ [ax*d, ay*d, d])[2] = z_world
denom = cam_R[2, 0] * ax + cam_R[2, 1] * ay + cam_R[2, 2]
depth = (z_world - cam_pos[2]) / denom
p_cam = np.array([ax * depth, ay * depth, depth])
return cam_pos + cam_R @ p_cam