Skip to content

Commit ad7a8c5

Browse files
Fixes for the following bugs:
- Bug 1 fix (AgentManagerImpl.java) — GlobalLock.isLockAvailable() is now called only when the host status is null or not one of the alive statuses. This eliminates one IS_FREE_LOCK DB query per ping per healthy host, which is the direct cause of the listHosts/listNetworks degradation.
- Bug 2 fix (HostConnectProcess.java) — shutdown() → shutdownNow(). Old thread pools from prior connect cycles are now interrupted immediately instead of draining their queued tasks, preventing thread accumulation during reconnect storms.
- Bug 3 fix (AgentManagerImpl.java) — A lock timeout in handleDisconnectWithoutInvestigation now logs a warning. The disconnect event is still not processed in that case, but it is no longer dropped silently.
- Bug 4 fix (ServerAttache.java) — Alarm ScheduledFuture handles are now tracked in _alarmFutures and cancelled when the corresponding listener is unregistered, or when all commands are cancelled on disconnect.
- Bug 5 fix (AgentAttache.java) — Applies the same Alarm ScheduledFuture tracking and cancellation (via _alarmFutures) to AgentAttache.
1 parent 25275a3 commit ad7a8c5

4 files changed

Lines changed: 35 additions & 6 deletions

File tree

agent/src/main/java/com/cloud/agent/HostConnectProcess.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ public boolean stop() {
9494

9595
private void stopHostStatusExecutor() {
9696
if (hostStatusExecutor != null) {
97-
hostStatusExecutor.shutdown();
97+
hostStatusExecutor.shutdownNow();
9898
hostStatusExecutor = null;
9999
}
100100
}

agent/src/main/java/com/cloud/agent/ServerAttache.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ public class ServerAttache {
8686
protected String _name;
8787
private Link _link;
8888
protected ConcurrentHashMap<Long, ServerListener> _waitForList;
89+
protected ConcurrentHashMap<Long, java.util.concurrent.ScheduledFuture<?>> _alarmFutures;
8990
protected LinkedList<Request> _requests;
9091
protected Long _currentSequence;
9192
protected long _nextSequence;
@@ -94,6 +95,7 @@ protected ServerAttache(Link link) {
9495
_name = link.getIpAddress();
9596
_link = link;
9697
_waitForList = new ConcurrentHashMap<>();
98+
_alarmFutures = new ConcurrentHashMap<>();
9799
_requests = new LinkedList<>();
98100
_nextSequence = Long.valueOf(s_rand.nextInt(Short.MAX_VALUE)) << 48;
99101
}
@@ -148,7 +150,9 @@ protected void registerListener(long seq, ServerListener listener) {
148150
logger.trace(log(seq, "Registering listener"));
149151
}
150152
if (listener.getTimeout() != -1) {
151-
s_listenerExecutor.schedule(new Alarm(seq), listener.getTimeout(), TimeUnit.SECONDS);
153+
java.util.concurrent.ScheduledFuture<?> alarmFuture =
154+
s_listenerExecutor.schedule(new Alarm(seq), listener.getTimeout(), TimeUnit.SECONDS);
155+
_alarmFutures.put(seq, alarmFuture);
152156
}
153157
_waitForList.put(seq, listener);
154158
}
@@ -157,6 +161,10 @@ protected ServerListener unregisterListener(long sequence) {
157161
if (logger.isTraceEnabled()) {
158162
logger.trace(log(sequence, "Unregistering listener"));
159163
}
164+
java.util.concurrent.ScheduledFuture<?> alarmFuture = _alarmFutures.remove(sequence);
165+
if (alarmFuture != null) {
166+
alarmFuture.cancel(false);
167+
}
160168
return _waitForList.remove(sequence);
161169
}
162170

@@ -205,8 +213,13 @@ protected void cancelAllCommands() {
205213
Map.Entry<Long, ServerListener> entry = it.next();
206214
it.remove();
207215

216+
long seq = entry.getKey();
217+
java.util.concurrent.ScheduledFuture<?> alarmFuture = _alarmFutures.remove(seq);
218+
if (alarmFuture != null) {
219+
alarmFuture.cancel(false);
220+
}
208221
ServerListener monitor = entry.getValue();
209-
logger.debug(log(entry.getKey(), "Sending disconnect to " + monitor.getClass()));
222+
logger.debug(log(seq, "Sending disconnect to " + monitor.getClass()));
210223
monitor.processDisconnect();
211224
}
212225
}

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentAttache.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ public int compare(final Object o1, final Object o2) {
120120
protected String _name = null;
121121
protected HypervisorType _hypervisorType;
122122
protected final ConcurrentHashMap<Long, Listener> _waitForList;
123+
protected ConcurrentHashMap<Long, java.util.concurrent.ScheduledFuture<?>> _alarmFutures;
123124
protected final LinkedList<Request> _requests;
124125
protected Long _currentSequence;
125126
protected Status _status = Status.Connecting;
@@ -146,6 +147,7 @@ protected AgentAttache(final AgentManagerImpl agentMgr, final long id, final Str
146147
_name = name;
147148
_hypervisorType = hypervisorType;
148149
_waitForList = new ConcurrentHashMap<Long, Listener>();
150+
_alarmFutures = new ConcurrentHashMap<>();
149151
_currentSequence = null;
150152
_maintenance = maintenance;
151153
_requests = new LinkedList<Request>();
@@ -241,13 +243,19 @@ protected synchronized int findRequest(final long seq) {
241243
protected void registerListener(final long seq, final Listener listener) {
242244
logger.trace(LOG_SEQ_FORMATTED_STRING, seq, "Registering listener");
243245
if (listener.getTimeout() != -1) {
244-
s_listenerExecutor.schedule(new Alarm(seq), listener.getTimeout(), TimeUnit.SECONDS);
246+
java.util.concurrent.ScheduledFuture<?> alarmFuture =
247+
s_listenerExecutor.schedule(new Alarm(seq), listener.getTimeout(), TimeUnit.SECONDS);
248+
_alarmFutures.put(seq, alarmFuture);
245249
}
246250
_waitForList.put(seq, listener);
247251
}
248252

249253
protected Listener unregisterListener(final long sequence) {
250254
logger.trace(LOG_SEQ_FORMATTED_STRING, sequence, "Unregistering listener");
255+
java.util.concurrent.ScheduledFuture<?> alarmFuture = _alarmFutures.remove(sequence);
256+
if (alarmFuture != null) {
257+
alarmFuture.cancel(false);
258+
}
251259
return _waitForList.remove(sequence);
252260
}
253261

@@ -338,8 +346,13 @@ protected void cancelAllCommands(final Status state, final boolean cancelActive)
338346
while (it.hasNext()) {
339347
final Map.Entry<Long, Listener> entry = it.next();
340348
it.remove();
349+
long seq = entry.getKey();
350+
java.util.concurrent.ScheduledFuture<?> alarmFuture = _alarmFutures.remove(seq);
351+
if (alarmFuture != null) {
352+
alarmFuture.cancel(false);
353+
}
341354
final Listener monitor = entry.getValue();
342-
logger.debug(LOG_SEQ_FORMATTED_STRING, entry.getKey(), "Sending disconnect to " + monitor.getClass());
355+
logger.debug(LOG_SEQ_FORMATTED_STRING, seq, "Sending disconnect to " + monitor.getClass());
343356
monitor.processDisconnect(_id, _uuid, _name, state);
344357
}
345358
}

engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1409,6 +1409,9 @@ protected boolean handleDisconnectWithoutInvestigation(final AgentAttache attach
14091409
logger.debug("Handling disconnect without investigation for {}", attache);
14101410
if (joinLock.lock(getTimeoutSec())) {
14111411
result = disconnectAgent(attache, event, transitState, hostId, joinLock);
1412+
} else {
1413+
logger.warn("Failed to acquire join lock for {} within timeout, " +
1414+
"disconnect event {} may be lost; another MS node may be processing it", attache, event);
14121415
}
14131416
} finally {
14141417
joinLock.releaseRef();
@@ -2269,8 +2272,8 @@ protected void processRequest(final Link link, final Request request) {
22692272
private boolean sendRequestStartupCommand(long hostId, HostVO host) {
22702273
boolean requestStartup = false;
22712274
Set<Status> aliveStatuses = getAliveHostStatuses();
2272-
boolean lockAvailable = GlobalLock.isLockAvailable(getHostJoinLockName(hostId));
22732275
if (host.getStatus() == null || !aliveStatuses.contains(host.getStatus())) {
2276+
boolean lockAvailable = GlobalLock.isLockAvailable(getHostJoinLockName(hostId));
22742277
if (!lockAvailable) {
22752278
logger.debug("Ping from host {}: requesting startup command " +
22762279
"due to host status ({}) is not considered as alive ({}), " +

0 commit comments

Comments
 (0)