Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
cf4b305
Support for Management Server Maintenance
sureshanaparti Oct 24, 2024
ef86e96
fixes & code improvements
sureshanaparti Nov 5, 2024
1acb52f
agent reconnect fixes, consider avoid list
sureshanaparti Nov 5, 2024
1c22384
ui fixes
sureshanaparti Nov 5, 2024
18877b3
ui confirmation text fix
sureshanaparti Nov 6, 2024
b785508
direct agents transfer and pending jobs timer task fixes
sureshanaparti Nov 14, 2024
74e9a73
close unclosed socket channels if any
sureshanaparti Nov 14, 2024
d071cbe
Updated pending jobs check timer task with ScheduledExecutorService
sureshanaparti Nov 15, 2024
6494b42
fixes
sureshanaparti Nov 15, 2024
c16da02
keep maintenance state on trigger shutdown call when ms is in mainten…
sureshanaparti Nov 27, 2024
4c609b7
direct agent transfer fixes
sureshanaparti Nov 29, 2024
bf2c80a
add pending jobs count to ms response
sureshanaparti Dec 2, 2024
dc0b6f9
during ms heartbeat, update state to up only when it's down
sureshanaparti Dec 3, 2024
1acc64c
allow vm work jobs of async job created before prepare for maintenance
sureshanaparti Dec 5, 2024
057758e
upgrade path changes
sureshanaparti Dec 5, 2024
065fdd0
Revert "keep maintenance state on trigger shutdown call when ms is in…
sureshanaparti Dec 10, 2024
c23ddd1
rebase fix
sureshanaparti Dec 10, 2024
165bff9
upgrade path changes
sureshanaparti Jan 10, 2025
e5a5251
rebase fixes
sureshanaparti Jan 10, 2025
d60f8d1
skip maintenance test when multiple management servers are not availa…
sureshanaparti Jan 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 84 additions & 14 deletions agent/src/main/java/com/cloud/agent/Agent.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.nio.channels.ClosedChannelException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand All @@ -40,6 +41,8 @@

import javax.naming.ConfigurationException;

import com.cloud.agent.api.MigrateAgentConnectionAnswer;
import com.cloud.agent.api.MigrateAgentConnectionCommand;
import com.cloud.resource.AgentStatusUpdater;
import com.cloud.resource.ResourceStatusUpdater;
import com.cloud.agent.api.PingAnswer;
Expand Down Expand Up @@ -313,7 +316,6 @@ public void start() {
}
_shell.updateConnectedHost();
scavengeOldAgentObjects();

}

public void stop(final String reason, final String detail) {
Expand Down Expand Up @@ -477,13 +479,18 @@ public synchronized void lockStartupTask(final Link link) {
}

/**
 * Convenience overload of {@link #sendStartup(Link, boolean)} for a regular connection.
 *
 * Passes {@code false} as the transfer flag, so the startup commands are not
 * marked as belonging to a transferred connection.
 *
 * @param link the link handed on to {@link #sendStartup(Link, boolean)}
 */
public void sendStartup(final Link link) {
sendStartup(link, false);
}

public void sendStartup(final Link link, boolean transfer) {
final StartupCommand[] startup = _resource.initialize();
if (startup != null) {
final String msHostList = _shell.getPersistentProperty(null, "host");
final Command[] commands = new Command[startup.length];
for (int i = 0; i < startup.length; i++) {
setupStartupCommand(startup[i]);
startup[i].setMSHostList(msHostList);
startup[i].setConnectionTransferred(transfer);
commands[i] = startup[i];
}
final Request request = new Request(_id != null ? _id : -1, -1, commands, false, false);
Expand Down Expand Up @@ -541,9 +548,14 @@ public Task create(final Task.Type type, final Link link, final byte[] data) {
}

protected void reconnect(final Link link) {
if (!_reconnectAllowed) {
reconnect(link, null, null, false);
}

protected void reconnect(final Link link, String preferredHost, List<String> avoidHostList, boolean forTransfer) {
if (!(forTransfer || _reconnectAllowed)) {
return;
}

synchronized (this) {
if (_startup != null) {
_startup.cancel();
Expand Down Expand Up @@ -575,22 +587,29 @@ protected void reconnect(final Link link) {
_shell.getBackoffAlgorithm().waitBeforeRetry();
}

String host = preferredHost;
if (StringUtils.isEmpty(host)) {
host = _shell.getNextHost();
}

do {
final String host = _shell.getNextHost();
_connection = new NioClient("Agent", host, _shell.getPort(), _shell.getWorkers(), this);
logger.info("Reconnecting to host:{}", host);
try {
_connection.start();
} catch (final NioConnectionException e) {
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
_connection.stop();
if (CollectionUtils.isEmpty(avoidHostList) || !avoidHostList.contains(host)) {
_connection = new NioClient("Agent", host, _shell.getPort(), _shell.getWorkers(), this);
logger.info("Reconnecting to host:{}", host);
try {
_connection.cleanUp();
} catch (final IOException ex) {
logger.warn("Fail to clean up old connection. {}", ex);
_connection.start();
} catch (final NioConnectionException e) {
logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e);
_connection.stop();
try {
_connection.cleanUp();
} catch (final IOException ex) {
logger.warn("Fail to clean up old connection. {}", ex);
}
}
}
_shell.getBackoffAlgorithm().waitBeforeRetry();
host = _shell.getNextHost();
} while (!_connection.isStartup());
_shell.updateConnectedHost();
logger.info("Connected to the host: {}", _shell.getConnectedHost());
Expand Down Expand Up @@ -703,6 +722,8 @@ protected void processRequest(final Request request, final Link link) {
}
} else if (cmd instanceof SetupMSListCommand) {
answer = setupManagementServerList((SetupMSListCommand) cmd);
} else if (cmd instanceof MigrateAgentConnectionCommand) {
answer = migrateAgentToOtherMS((MigrateAgentConnectionCommand) cmd);
} else {
if (cmd instanceof ReadyCommand) {
processReadyCommand(cmd);
Expand Down Expand Up @@ -858,6 +879,53 @@ private Answer setupManagementServerList(final SetupMSListCommand cmd) {
return new SetupMSListAnswer(true);
}

/**
 * Handles a {@link MigrateAgentConnectionCommand} by moving this agent's
 * connection to another management server.
 *
 * When the command carries a non-empty management server list, that list
 * (together with the load-balancing algorithm and check interval) is processed
 * first; afterwards the connection is migrated while skipping the hosts in the
 * command's avoid list.
 *
 * @param cmd the migrate command received from the management server
 * @return a successful {@link MigrateAgentConnectionAnswer}, or one carrying the
 *         failure message if any exception was raised along the way
 */
private Answer migrateAgentToOtherMS(final MigrateAgentConnectionCommand cmd) {
    try {
        // Only touch the configured management server list when a new one was supplied.
        if (!CollectionUtils.isEmpty(cmd.getMsList())) {
            processManagementServerList(cmd.getMsList(), cmd.getLbAlgorithm(), cmd.getLbCheckInterval());
        }
        migrateAgentConnection(cmd.getAvoidMsList());
    } catch (Exception ex) {
        // Any failure is reported back to the caller as an unsuccessful answer rather than rethrown.
        final String failure = "Migrate agent connection failed, due to " + ex.getMessage();
        logger.debug(failure, ex);
        return new MigrateAgentConnectionAnswer(failure);
    }
    return new MigrateAgentConnectionAnswer(true);
}

/**
 * Migrates this agent's connection to a reachable management server that is
 * not in the given avoid list.
 *
 * The candidate hosts are the hosts configured in the agent shell minus the
 * avoid list. Each candidate is probed in order with a short TCP connect; the
 * first one that accepts the connection becomes the preferred host and a
 * transfer reconnect is started towards it. An unreachable candidate no longer
 * aborts the migration: the remaining candidates are still probed, and the
 * migration only fails when none of them is reachable.
 *
 * @param avoidMsList management server hosts that must not be chosen; may be
 *                    {@code null} or empty, in which case no host is excluded
 * @throws CloudRuntimeException if no hosts are configured, if no candidate is
 *                               left after applying the avoid list, or if none
 *                               of the candidates is reachable
 */
private void migrateAgentConnection(List<String> avoidMsList) {
    final String[] msHosts = _shell.getHosts();
    if (msHosts == null || msHosts.length < 1) {
        throw new CloudRuntimeException("Management Server hosts empty, not properly configured in agent");
    }

    // Arrays.asList is a fixed-size view, so copy it before removing entries.
    List<String> msHostsList = new ArrayList<>(Arrays.asList(msHosts));
    // removeAll(null) throws NullPointerException; a missing avoid list simply excludes nothing.
    if (avoidMsList != null) {
        msHostsList.removeAll(avoidMsList);
    }
    if (msHostsList.isEmpty() || StringUtils.isEmpty(msHostsList.get(0))) {
        throw new CloudRuntimeException("No other Management Server hosts to migrate");
    }

    String preferredHost = null;
    for (String msHost : msHostsList) {
        // Probe with a throw-away socket (5 second connect timeout); it is closed by try-with-resources.
        try (final Socket socket = new Socket()) {
            socket.connect(new InetSocketAddress(msHost, _shell.getPort()), 5000);
            preferredHost = msHost;
            break;
        } catch (final IOException e) {
            // Do not give up on the first unreachable host; keep probing the remaining candidates.
            logger.debug("Management server host: {} is not reachable, to migrate connection", msHost, e);
        }
    }

    if (preferredHost == null) {
        throw new CloudRuntimeException("Management server host(s) are not reachable, to migrate connection");
    }

    logger.debug("Management server host {} is found to be reachable, trying to reconnect", preferredHost);
    _shell.resetHostCounter();
    // Mark the upcoming connection as a transfer before reconnecting.
    _shell.setConnectionTransfer(true);
    reconnect(_link, preferredHost, avoidMsList, true);
}

public void processResponse(final Response response, final Link link) {
final Answer answer = response.getAnswer();
logger.debug("Received response: {}", response.toString());
Expand Down Expand Up @@ -1153,7 +1221,8 @@ public void doTask(final Task task) throws TaskExecutionException {
if (task.getType() == Task.Type.CONNECT) {
_shell.getBackoffAlgorithm().reset();
setLink(task.getLink());
sendStartup(task.getLink());
sendStartup(task.getLink(), _shell.isConnectionTransfer());
_shell.setConnectionTransfer(false);
} else if (task.getType() == Task.Type.DATA) {
Request request;
try {
Expand All @@ -1178,6 +1247,7 @@ public void doTask(final Task task) throws TaskExecutionException {
Thread.sleep(5000);
} catch (InterruptedException e) {
}
_shell.setConnectionTransfer(false);
reconnect(task.getLink());
return;
} else if (task.getType() == Task.Type.OTHER) {
Expand Down
9 changes: 9 additions & 0 deletions agent/src/main/java/com/cloud/agent/AgentShell.java
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ public class AgentShell implements IAgentShell, Daemon {
private String hostToConnect;
private String connectedHost;
private Long preferredHostCheckInterval;
private boolean connectionTransfer = false;
protected AgentProperties agentProperties = new AgentProperties();

public AgentShell() {
Expand Down Expand Up @@ -215,6 +216,14 @@ public void setPersistentProperty(String prefix, String name, String value) {
_storage.persist(name, value);
}

/**
 * Returns the current value of the {@code connectionTransfer} flag held by this shell.
 *
 * @return the value last stored via {@link #setConnectionTransfer(boolean)}, or the field's initial value
 */
public boolean isConnectionTransfer() {
return connectionTransfer;
}

/**
 * Stores the {@code connectionTransfer} flag on this shell.
 *
 * @param connectionTransfer the new flag value
 */
public void setConnectionTransfer(boolean connectionTransfer) {
this.connectionTransfer = connectionTransfer;
}

void loadProperties() throws ConfigurationException {
final File file = PropertiesUtil.findConfigFile("agent.properties");

Expand Down
4 changes: 4 additions & 0 deletions agent/src/main/java/com/cloud/agent/IAgentShell.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,8 @@ public interface IAgentShell {
String getConnectedHost();

void launchNewAgent(ServerResource resource) throws ConfigurationException;

/**
 * Tells whether the connection-transfer flag is currently set on this shell.
 *
 * @return the current connection-transfer flag
 */
boolean isConnectionTransfer();

/**
 * Sets or clears the connection-transfer flag on this shell.
 *
 * @param connectionTransfer the new flag value
 */
void setConnectionTransfer(boolean connectionTransfer);
}
2 changes: 2 additions & 0 deletions api/src/main/java/com/cloud/host/Host.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ public static String[] toStrings(Host.Type... types) {
*/
Long getManagementServerId();

Long getLastManagementServerId();

/*
*@return removal date
*/
Expand Down
1 change: 1 addition & 0 deletions api/src/main/java/com/cloud/host/Status.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ public static String[] toStrings(Status... states) {
s_fsm.addTransition(Status.Connecting, Event.HostDown, Status.Down);
s_fsm.addTransition(Status.Connecting, Event.Ping, Status.Connecting);
s_fsm.addTransition(Status.Connecting, Event.ManagementServerDown, Status.Disconnected);
s_fsm.addTransition(Status.Connecting, Event.StartAgentRebalance, Status.Rebalancing);
s_fsm.addTransition(Status.Connecting, Event.AgentDisconnected, Status.Alert);
s_fsm.addTransition(Status.Up, Event.PingTimeout, Status.Alert);
s_fsm.addTransition(Status.Up, Event.AgentDisconnected, Status.Alert);
Expand Down
8 changes: 4 additions & 4 deletions api/src/main/java/com/cloud/resource/ResourceService.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@
import org.apache.cloudstack.api.command.admin.cluster.UpdateClusterCmd;
import org.apache.cloudstack.api.command.admin.host.AddHostCmd;
import org.apache.cloudstack.api.command.admin.host.AddSecondaryStorageCmd;
import org.apache.cloudstack.api.command.admin.host.CancelMaintenanceCmd;
import org.apache.cloudstack.api.command.admin.host.CancelHostMaintenanceCmd;
import org.apache.cloudstack.api.command.admin.host.ReconnectHostCmd;
import org.apache.cloudstack.api.command.admin.host.UpdateHostCmd;
import org.apache.cloudstack.api.command.admin.host.UpdateHostPasswordCmd;
import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd;
import org.apache.cloudstack.api.command.admin.host.PrepareForHostMaintenanceCmd;
import org.apache.cloudstack.api.command.admin.host.DeclareHostAsDegradedCmd;
import org.apache.cloudstack.api.command.admin.host.CancelHostAsDegradedCmd;

Expand All @@ -51,7 +51,7 @@ public interface ResourceService {

Host autoUpdateHostAllocationState(Long hostId, ResourceState.Event resourceEvent) throws NoTransitionException;

Host cancelMaintenance(CancelMaintenanceCmd cmd);
Host cancelMaintenance(CancelHostMaintenanceCmd cmd);

Host reconnectHost(ReconnectHostCmd cmd) throws AgentUnavailableException;

Expand All @@ -69,7 +69,7 @@ public interface ResourceService {

List<? extends Host> discoverHosts(AddSecondaryStorageCmd cmd) throws IllegalArgumentException, DiscoveryException, InvalidParameterValueException;

Host maintain(PrepareForMaintenanceCmd cmd);
Host maintain(PrepareForHostMaintenanceCmd cmd);

Host declareHostAsDegraded(DeclareHostAsDegradedCmd cmd) throws NoTransitionException;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package com.cloud.server;

import java.util.Date;
import java.util.List;

/**
* management server related stats
Expand Down Expand Up @@ -70,6 +71,10 @@ public interface ManagementServerHostStats {

String getOsDistribution();

List<String> getLastAgents();

List<String> getAgents();

int getAgentCount();

long getHeapMemoryUsed();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1136,9 +1136,12 @@ public class ApiConstants {
public static final String LOGOUT = "logout";
public static final String LIST_IDPS = "listIdps";

public static final String READY_FOR_SHUTDOWN = "readyforshutdown";
public static final String MAINTENANCE_INITIATED = "maintenanceinitiated";
public static final String SHUTDOWN_TRIGGERED = "shutdowntriggered";
public static final String READY_FOR_SHUTDOWN = "readyforshutdown";
public static final String PENDING_JOBS_COUNT = "pendingjobscount";
public static final String AGENTS_COUNT = "agentscount";
public static final String AGENTS = "agents";

public static final String PUBLIC_MTU = "publicmtu";
public static final String PRIVATE_MTU = "privatemtu";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

@APICommand(name = "cancelHostMaintenance", description = "Cancels host maintenance.", responseObject = HostResponse.class,
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
public class CancelMaintenanceCmd extends BaseAsyncCmd {
public class CancelHostMaintenanceCmd extends BaseAsyncCmd {


/////////////////////////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.cloudstack.api.response.ClusterResponse;
import org.apache.cloudstack.api.response.HostResponse;
import org.apache.cloudstack.api.response.ListResponse;
import org.apache.cloudstack.api.response.ManagementServerResponse;
import org.apache.cloudstack.api.response.PodResponse;
import org.apache.cloudstack.api.response.UserVmResponse;
import org.apache.cloudstack.api.response.ZoneResponse;
Expand Down Expand Up @@ -105,6 +106,9 @@ public class ListHostsCmd extends BaseListCmd {
@Parameter(name = ApiConstants.HYPERVISOR, type = CommandType.STRING, description = "hypervisor type of host: XenServer,KVM,VMware,Hyperv,BareMetal,Simulator")
private String hypervisor;

@Parameter(name = ApiConstants.MANAGEMENT_SERVER_ID, type = CommandType.UUID, entityType = ManagementServerResponse.class, description = "the id of the management server", since="4.21.0")
private Long managementServerId;

/////////////////////////////////////////////////////
/////////////////// Accessors ///////////////////////
/////////////////////////////////////////////////////
Expand Down Expand Up @@ -189,6 +193,10 @@ public String getHostOutOfBandManagementPowerState() {
return outOfBandManagementPowerState;
}

/**
 * Returns the value of the {@code managementServerId} field, which is bound to the
 * {@code ApiConstants.MANAGEMENT_SERVER_ID} request parameter of this command.
 *
 * @return the management server id held by this command
 */
public Long getManagementServerId() {
return managementServerId;
}

/////////////////////////////////////////////////////
/////////////// API Implementation///////////////////
/////////////////////////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

@APICommand(name = "prepareHostForMaintenance", description = "Prepares a host for maintenance.", responseObject = HostResponse.class,
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
public class PrepareForMaintenanceCmd extends BaseAsyncCmd {
public class PrepareForHostMaintenanceCmd extends BaseAsyncCmd {


/////////////////////////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,13 @@ public class AsyncJobResponse extends BaseResponse {
@Param(description = "the unique ID of the instance/entity object related to the job")
private String jobInstanceId;

@SerializedName("managementserverid")
@SerializedName(ApiConstants.MANAGEMENT_SERVER_ID)
@Param(description = "the msid of the management server on which the job is running", since = "4.19")
private Long msid;
private String managementServerId;

@SerializedName(ApiConstants.MANAGEMENT_SERVER_NAME)
@Param(description = "the management server name of the host", since = "4.21.0")
private String managementServerName;

@SerializedName(ApiConstants.CREATED)
@Param(description = " the created date of the job")
Expand Down Expand Up @@ -156,7 +160,11 @@ public void setRemoved(final Date removed) {
this.removed = removed;
}

public void setMsid(Long msid) {
this.msid = msid;
public void setManagementServerId(String managementServerId) {
this.managementServerId = managementServerId;
}

/**
 * Sets the management server name carried by this response.
 *
 * @param managementServerName the name to store in the {@code managementServerName} field
 */
public void setManagementServerName(String managementServerName) {
this.managementServerName = managementServerName;
}
}
Loading