Skip to content

Commit dfcec85

Browse files
FR77: Improve Management Server Maintenance (apache#480)
* Safely shutdown feature (ref: apache#6755)
* Updated version and some improvements
* Management Server Maintenance - Prepare and Cancel Maintenance changes
  This is supported for CloudStack deployments with multiple management servers.
  - While preparing for maintenance, the MS waits for pending jobs to finish, and then transfers/migrates the agents to other available MS
  - New APIs: prepareForMaintenance, cancelMaintenance
  - New MS States: PreparingToMaintenance, Maintenance
* check for single active management server
* refactoring plugin name
* updated version, and cleanup
* code improvements
* support list hosts by management server id
* update ui with ms maintenance apis
* code improvements
* ui changes
* ui icons update
* ui fixes
* cond checks for maintenance and shutdown
* fix for management server not down issue on service stop
* continue with other components on error
* agent transfer fixes
* maintenance window timeout and fixes
* ui changes - added connected agents tab, and updated hosts & management servers fields
* marvin test update
* keep maintenance after shutdown/restart, do not update last_updated time in cluster heartbeat during maintenance (notifies node inactive/down after heartbeat threshold)
* listener for ms maintenance updates
* cleanup
* keep last msid in host table
* review comments
* allow only one mgmt server to prepare for maintenance
* added ms uuid in logs
* minor code improvements
* ui fields update
* fix systemvm navigation in connected agents
* algorithm check and input from ui
* check for active ms from host setting
* agent migration code improvements
* minor ui label fix
* fixes & code improvements
* agent reconnect fixes, consider avoid list
* ui fixes
* direct agents transfer and pending jobs timer task fixes
* close unclosed socket channels if any
* Updated pending jobs check timer task with ScheduledExecutorService
* fixes
* keep maintenance state on trigger shutdown call when ms is in maintenance
* direct agent transfer fixes
* add pending jobs count to ms response
* during ms heartbeat, update state to up only when it's down
* allow vm work jobs of async job created before prepare for maintenance
* Revert "keep maintenance state on trigger shutdown call when ms is in maintenance"
  This reverts commit 4ebbea71ef20a65286bed41a517f03e253a8fe90.
* removed duplicate schema changes from schema-41800to41810.sql (already defined at schema-41811to41812.sql)
1 parent 0cb63a1 commit dfcec85

116 files changed

Lines changed: 4094 additions & 326 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

agent/src/main/java/com/cloud/agent/Agent.java

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.nio.channels.ClosedChannelException;
2828
import java.nio.charset.Charset;
2929
import java.util.ArrayList;
30+
import java.util.Arrays;
3031
import java.util.HashMap;
3132
import java.util.List;
3233
import java.util.Map;
@@ -40,6 +41,8 @@
4041

4142
import javax.naming.ConfigurationException;
4243

44+
import com.cloud.agent.api.MigrateAgentConnectionAnswer;
45+
import com.cloud.agent.api.MigrateAgentConnectionCommand;
4346
import com.cloud.resource.AgentStatusUpdater;
4447
import com.cloud.resource.ResourceStatusUpdater;
4548
import com.cloud.agent.api.PingAnswer;
@@ -307,7 +310,6 @@ public void start() {
307310
}
308311
_shell.updateConnectedHost();
309312
scavengeOldAgentObjects();
310-
311313
}
312314

313315
public void stop(final String reason, final String detail) {
@@ -457,13 +459,18 @@ public synchronized void lockStartupTask(final Link link) {
457459
}
458460

459461
public void sendStartup(final Link link) {
462+
sendStartup(link, false);
463+
}
464+
465+
public void sendStartup(final Link link, boolean transfer) {
460466
final StartupCommand[] startup = _resource.initialize();
461467
if (startup != null) {
462468
final String msHostList = _shell.getPersistentProperty(null, "host");
463469
final Command[] commands = new Command[startup.length];
464470
for (int i = 0; i < startup.length; i++) {
465471
setupStartupCommand(startup[i]);
466472
startup[i].setMSHostList(msHostList);
473+
startup[i].setConnectionTransferred(transfer);
467474
commands[i] = startup[i];
468475
}
469476
final Request request = new Request(_id != null ? _id : -1, -1, commands, false, false);
@@ -516,9 +523,14 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
516523
}
517524

518525
protected void reconnect(final Link link) {
519-
if (!_reconnectAllowed) {
526+
reconnect(link, null, null, false);
527+
}
528+
529+
protected void reconnect(final Link link, String preferredHost, List<String> avoidHostList, boolean forTransfer) {
530+
if (!(forTransfer || _reconnectAllowed)) {
520531
return;
521532
}
533+
522534
synchronized (this) {
523535
if (_startup != null) {
524536
_startup.cancel();
@@ -550,22 +562,29 @@ protected void reconnect(final Link link) {
550562
_shell.getBackoffAlgorithm().waitBeforeRetry();
551563
}
552564

565+
String host = preferredHost;
566+
if (StringUtils.isEmpty(host)) {
567+
host = _shell.getNextHost();
568+
}
569+
553570
do {
554-
final String host = _shell.getNextHost();
555-
_connection = new NioClient("Agent", host, _shell.getPort(), _shell.getWorkers(), this);
556-
s_logger.info("Reconnecting to host:" + host);
557-
try {
558-
_connection.start();
559-
} catch (final NioConnectionException e) {
560-
s_logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
561-
_connection.stop();
571+
if (CollectionUtils.isEmpty(avoidHostList) || !avoidHostList.contains(host)) {
572+
_connection = new NioClient("Agent", host, _shell.getPort(), _shell.getWorkers(), this);
573+
s_logger.info("Reconnecting to host:" + host);
562574
try {
563-
_connection.cleanUp();
564-
} catch (final IOException ex) {
565-
s_logger.warn("Fail to clean up old connection. " + ex);
575+
_connection.start();
576+
} catch (final NioConnectionException e) {
577+
s_logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
578+
_connection.stop();
579+
try {
580+
_connection.cleanUp();
581+
} catch (final IOException ex) {
582+
s_logger.warn("Fail to clean up old connection. " + ex);
583+
}
566584
}
567585
}
568586
_shell.getBackoffAlgorithm().waitBeforeRetry();
587+
host = _shell.getNextHost();
569588
} while (!_connection.isStartup());
570589
_shell.updateConnectedHost();
571590
s_logger.info("Connected to the host: " + _shell.getConnectedHost());
@@ -674,6 +693,8 @@ protected void processRequest(final Request request, final Link link) {
674693
}
675694
} else if (cmd instanceof SetupMSListCommand) {
676695
answer = setupManagementServerList((SetupMSListCommand) cmd);
696+
} else if (cmd instanceof MigrateAgentConnectionCommand) {
697+
answer = migrateAgentToOtherMS((MigrateAgentConnectionCommand) cmd);
677698
} else {
678699
if (cmd instanceof ReadyCommand) {
679700
processReadyCommand(cmd);
@@ -829,6 +850,53 @@ private Answer setupManagementServerList(final SetupMSListCommand cmd) {
829850
return new SetupMSListAnswer(true);
830851
}
831852

853+
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
854+
try {
855+
if (CollectionUtils.isNotEmpty(cmd.getMsList())) {
856+
processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
857+
}
858+
migrateAgentConnection(cmd.getAvoidMsList());
859+
} catch (Exception e) {
860+
String errMsg = "Migrate agent connection failed, due to " + e.getMessage();
861+
s_logger.debug(errMsg, e);
862+
return new MigrateAgentConnectionAnswer(errMsg);
863+
}
864+
return new MigrateAgentConnectionAnswer(true);
865+
}
866+
867+
private void migrateAgentConnection(List<String> avoidMsList) {
868+
final String[] msHosts = _shell.getHosts();
869+
if (msHosts == null || msHosts.length < 1) {
870+
throw new CloudRuntimeException("Management Server hosts empty, not properly configured in agent");
871+
}
872+
873+
List<String> msHostsList = new ArrayList<>(Arrays.asList(msHosts));
874+
msHostsList.removeAll(avoidMsList);
875+
if (msHostsList.isEmpty() || StringUtils.isEmpty(msHostsList.get(0))) {
876+
throw new CloudRuntimeException("No other Management Server hosts to migrate");
877+
}
878+
879+
String preferredHost = null;
880+
for (String msHost : msHostsList) {
881+
try (final Socket socket = new Socket()) {
882+
socket.connect(new InetSocketAddress(msHost, _shell.getPort()), 5000);
883+
preferredHost = msHost;
884+
break;
885+
} catch (final IOException e) {
886+
throw new CloudRuntimeException("Management server host: " + msHost + " is not reachable, to migrate connection");
887+
}
888+
}
889+
890+
if (preferredHost == null) {
891+
throw new CloudRuntimeException("Management server host(s) are not reachable, to migrate connection");
892+
}
893+
894+
s_logger.debug("Management server host " + preferredHost + " is found to be reachable, trying to reconnect");
895+
_shell.resetHostCounter();
896+
_shell.setConnectionTransfer(true);
897+
reconnect(_link, preferredHost, avoidMsList, true);
898+
}
899+
832900
public void processResponse(final Response response, final Link link) {
833901
final Answer answer = response.getAnswer();
834902
if (s_logger.isDebugEnabled()) {
@@ -1122,7 +1190,8 @@ public void doTask(final Task task) throws TaskExecutionException {
11221190
if (task.getType() == Task.Type.CONNECT) {
11231191
_shell.getBackoffAlgorithm().reset();
11241192
setLink(task.getLink());
1125-
sendStartup(task.getLink());
1193+
sendStartup(task.getLink(), _shell.isConnectionTransfer());
1194+
_shell.setConnectionTransfer(false);
11261195
} else if (task.getType() == Task.Type.DATA) {
11271196
Request request;
11281197
try {
@@ -1141,6 +1210,7 @@ public void doTask(final Task task) throws TaskExecutionException {
11411210
s_logger.error("Error parsing task", e);
11421211
}
11431212
} else if (task.getType() == Task.Type.DISCONNECT) {
1213+
_shell.setConnectionTransfer(false);
11441214
reconnect(task.getLink());
11451215
return;
11461216
} else if (task.getType() == Task.Type.OTHER) {

agent/src/main/java/com/cloud/agent/AgentShell.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public class AgentShell implements IAgentShell, Daemon {
7676
private String hostToConnect;
7777
private String connectedHost;
7878
private Long preferredHostCheckInterval;
79+
private boolean connectionTransfer = false;
7980
protected AgentProperties agentProperties = new AgentProperties();
8081

8182
public AgentShell() {
@@ -214,6 +215,14 @@ public void setPersistentProperty(String prefix, String name, String value) {
214215
_storage.persist(name, value);
215216
}
216217

218+
public boolean isConnectionTransfer() {
219+
return connectionTransfer;
220+
}
221+
222+
public void setConnectionTransfer(boolean connectionTransfer) {
223+
this.connectionTransfer = connectionTransfer;
224+
}
225+
217226
void loadProperties() throws ConfigurationException {
218227
final File file = PropertiesUtil.findConfigFile("agent.properties");
219228

agent/src/main/java/com/cloud/agent/IAgentShell.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,8 @@ public interface IAgentShell {
7070
String getConnectedHost();
7171

7272
void launchNewAgent(ServerResource resource) throws ConfigurationException;
73+
74+
boolean isConnectionTransfer();
75+
76+
void setConnectionTransfer(boolean connectionTransfer);
7377
}

api/src/main/java/com/cloud/host/Host.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@ public static String[] toStrings(Host.Type... types) {
175175
*/
176176
Long getManagementServerId();
177177

178+
Long getLastManagementServerId();
179+
178180
/*
179181
*@return removal date
180182
*/

api/src/main/java/com/cloud/host/Status.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ public static String[] toStrings(Status... states) {
127127
s_fsm.addTransition(Status.Connecting, Event.HostDown, Status.Down);
128128
s_fsm.addTransition(Status.Connecting, Event.Ping, Status.Connecting);
129129
s_fsm.addTransition(Status.Connecting, Event.ManagementServerDown, Status.Disconnected);
130+
s_fsm.addTransition(Status.Connecting, Event.StartAgentRebalance, Status.Rebalancing);
130131
s_fsm.addTransition(Status.Connecting, Event.AgentDisconnected, Status.Alert);
131132
s_fsm.addTransition(Status.Up, Event.PingTimeout, Status.Alert);
132133
s_fsm.addTransition(Status.Up, Event.AgentDisconnected, Status.Alert);

api/src/main/java/com/cloud/resource/ResourceService.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323
import org.apache.cloudstack.api.command.admin.cluster.UpdateClusterCmd;
2424
import org.apache.cloudstack.api.command.admin.host.AddHostCmd;
2525
import org.apache.cloudstack.api.command.admin.host.AddSecondaryStorageCmd;
26-
import org.apache.cloudstack.api.command.admin.host.CancelMaintenanceCmd;
26+
import org.apache.cloudstack.api.command.admin.host.CancelHostMaintenanceCmd;
2727
import org.apache.cloudstack.api.command.admin.host.ReconnectHostCmd;
2828
import org.apache.cloudstack.api.command.admin.host.UpdateHostCmd;
2929
import org.apache.cloudstack.api.command.admin.host.UpdateHostPasswordCmd;
30-
import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd;
30+
import org.apache.cloudstack.api.command.admin.host.PrepareForHostMaintenanceCmd;
3131
import org.apache.cloudstack.api.command.admin.host.DeclareHostAsDegradedCmd;
3232
import org.apache.cloudstack.api.command.admin.host.CancelHostAsDegradedCmd;
3333

@@ -51,7 +51,7 @@ public interface ResourceService {
5151

5252
Host autoUpdateHostAllocationState(Long hostId, ResourceState.Event resourceEvent) throws NoTransitionException;
5353

54-
Host cancelMaintenance(CancelMaintenanceCmd cmd);
54+
Host cancelMaintenance(CancelHostMaintenanceCmd cmd);
5555

5656
Host reconnectHost(ReconnectHostCmd cmd) throws AgentUnavailableException;
5757

@@ -69,7 +69,7 @@ public interface ResourceService {
6969

7070
List<? extends Host> discoverHosts(AddSecondaryStorageCmd cmd) throws IllegalArgumentException, DiscoveryException, InvalidParameterValueException;
7171

72-
Host maintain(PrepareForMaintenanceCmd cmd);
72+
Host maintain(PrepareForHostMaintenanceCmd cmd);
7373

7474
Host declareHostAsDegraded(DeclareHostAsDegradedCmd cmd) throws NoTransitionException;
7575

api/src/main/java/com/cloud/server/ManagementServerHostStats.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package com.cloud.server;
2020

2121
import java.util.Date;
22+
import java.util.List;
2223

2324
/**
2425
* management server related stats
@@ -70,6 +71,10 @@ public interface ManagementServerHostStats {
7071

7172
String getOsDistribution();
7273

74+
List<String> getLastAgents();
75+
76+
List<String> getAgents();
77+
7378
int getAgentCount();
7479

7580
long getHeapMemoryUsed();

api/src/main/java/org/apache/cloudstack/api/ApiConstants.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,8 @@ public class ApiConstants {
541541
public static final String PRIVATE_NETWORK_ID = "privatenetworkid";
542542
public static final String ALLOCATION_STATE = "allocationstate";
543543
public static final String MANAGED_STATE = "managedstate";
544+
public static final String MANAGEMENT_SERVER_ID = "managementserverid";
545+
public static final String MANAGEMENT_SERVER_NAME = "managementservername";
544546
public static final String STORAGE_ID = "storageid";
545547
public static final String PING_STORAGE_SERVER_IP = "pingstorageserverip";
546548
public static final String PING_DIR = "pingdir";
@@ -1038,6 +1040,13 @@ public class ApiConstants {
10381040
public static final String LOGOUT = "logout";
10391041
public static final String LIST_IDPS = "listIdps";
10401042

1043+
public static final String MAINTENANCE_INITIATED = "maintenanceinitiated";
1044+
public static final String SHUTDOWN_TRIGGERED = "shutdowntriggered";
1045+
public static final String READY_FOR_SHUTDOWN = "readyforshutdown";
1046+
public static final String PENDING_JOBS_COUNT = "pendingjobscount";
1047+
public static final String AGENTS_COUNT = "agentscount";
1048+
public static final String AGENTS = "agents";
1049+
10411050
public static final String PUBLIC_MTU = "publicmtu";
10421051
public static final String PRIVATE_MTU = "privatemtu";
10431052
public static final String MTU = "mtu";

api/src/main/java/org/apache/cloudstack/api/command/admin/host/CancelMaintenanceCmd.java renamed to api/src/main/java/org/apache/cloudstack/api/command/admin/host/CancelHostMaintenanceCmd.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@
3434

3535
@APICommand(name = "cancelHostMaintenance", description = "Cancels host maintenance.", responseObject = HostResponse.class,
3636
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
37-
public class CancelMaintenanceCmd extends BaseAsyncCmd {
38-
public static final Logger s_logger = Logger.getLogger(CancelMaintenanceCmd.class.getName());
37+
public class CancelHostMaintenanceCmd extends BaseAsyncCmd {
38+
public static final Logger s_logger = Logger.getLogger(CancelHostMaintenanceCmd.class.getName());
3939

4040

4141
/////////////////////////////////////////////////////

api/src/main/java/org/apache/cloudstack/api/command/admin/host/ListHostsCmd.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.List;
2222
import java.util.Map;
2323

24+
import org.apache.cloudstack.api.response.ManagementServerResponse;
2425
import org.apache.log4j.Logger;
2526

2627
import org.apache.cloudstack.api.APICommand;
@@ -107,6 +108,9 @@ public class ListHostsCmd extends BaseListCmd {
107108
@Parameter(name = ApiConstants.HYPERVISOR, type = CommandType.STRING, description = "hypervisor type of host: XenServer,KVM,VMware,Hyperv,BareMetal,Simulator")
108109
private String hypervisor;
109110

111+
@Parameter(name = ApiConstants.MANAGEMENT_SERVER_ID, type = CommandType.UUID, entityType = ManagementServerResponse.class, description = "the id of the management server", since="4.18.1")
112+
private Long managementServerId;
113+
110114
/////////////////////////////////////////////////////
111115
/////////////////// Accessors ///////////////////////
112116
/////////////////////////////////////////////////////
@@ -178,7 +182,6 @@ public String getResourceState() {
178182
return resourceState;
179183
}
180184

181-
182185
public Boolean isOutOfBandManagementEnabled() {
183186
return outOfBandManagementEnabled;
184187
}
@@ -187,6 +190,10 @@ public String getHostOutOfBandManagementPowerState() {
187190
return outOfBandManagementPowerState;
188191
}
189192

193+
public Long getManagementServerId() {
194+
return managementServerId;
195+
}
196+
190197
/////////////////////////////////////////////////////
191198
/////////////// API Implementation///////////////////
192199
/////////////////////////////////////////////////////

0 commit comments

Comments
 (0)