Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions .github/workflows/ci-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,6 @@ jobs:
steps:
- uses: actions/checkout@v6

# get version of machine.py - MACHINE_PY_IMAGE will force the docker compose to use the proper version of machine.py
- name: Install regctl
uses: iarekylew00t/regctl-installer@v4.0.8

- name: Set proper version of Machine.py
run: |
export MACHINE_PY_IMAGE=ghcr.io/sillsdev/machine.py:$(regctl image config ghcr.io/sillsdev/machine.py | jq -r ".config.Labels[\"org.opencontainers.image.version\"]") && \
echo "MACHINE_PY_IMAGE=$MACHINE_PY_IMAGE" >> $GITHUB_ENV && \
echo "MACHINE_PY_CPU_IMAGE=$MACHINE_PY_IMAGE.cpu_only" >> $GITHUB_ENV

- name: Confirm proper version of Machine.py
run: |
echo $MACHINE_PY_IMAGE $MACHINE_PY_CPU_IMAGE

- name: Setup .NET
uses: actions/setup-dotnet@v5
with:
Expand Down
16 changes: 8 additions & 8 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ services:
- "ClearML__AccessKey=${ClearML_AccessKey:?access key needed}"
- "ClearML__SecretKey=${ClearML_SecretKey:?secret key needed}"
- BuildJob__ClearML__0__Queue=${CLEARML_GPU_QUEUE:-lambert_24gb}
- BuildJob__ClearML__0__DockerImage=${MACHINE_PY_IMAGE:-ghcr.io/sillsdev/machine.py:latest}
- BuildJob__ClearML__0__DockerImage=${MACHINE_PY_IMAGE:-ghcr.io/sillsdev/machine.py:1.12.0}
- BuildJob__ClearML__1__Queue=${CLEARML_CPU_QUEUE:-lambert_24gb.cpu_only}
- BuildJob__ClearML__1__DockerImage=${MACHINE_PY_CPU_IMAGE:-ghcr.io/sillsdev/machine.py:latest.cpu_only}
- BuildJob__ClearML__1__DockerImage=${MACHINE_PY_CPU_IMAGE:-ghcr.io/sillsdev/machine.py:1.12.0.cpu_only}
- BuildJob__ClearML__2__Queue=${CLEARML_CPU_QUEUE:-lambert_24gb.cpu_only}
- BuildJob__ClearML__2__DockerImage=${MACHINE_PY_CPU_IMAGE:-ghcr.io/sillsdev/machine.py:latest.cpu_only}
- BuildJob__ClearML__2__DockerImage=${MACHINE_PY_CPU_IMAGE:-ghcr.io/sillsdev/machine.py:1.12.0.cpu_only}
- SharedFile__Uri=s3://silnlp/docker-compose/
- "SharedFile__S3AccessKeyId=${AWS_ACCESS_KEY_ID:?access key needed}"
- "SharedFile__S3SecretAccessKey=${AWS_SECRET_ACCESS_KEY:?secret key needed}"
Expand All @@ -42,7 +42,7 @@ services:
- ~/.nuget/packages:/root/.nuget/packages:ro
- /var/lib/machine:/var/lib/machine
- /var/lib/serval:/var/lib/serval
working_dir: '/app/src/Serval/src/Serval.ApiServer'
working_dir: "/app/src/Serval/src/Serval.ApiServer"
entrypoint:
- dotnet
- run
Expand All @@ -63,7 +63,7 @@ services:
# then hang forever so the container does not exit
command:
[
'/bin/sh',
'-c',
'mongod --quiet --replSet myRS --bind_ip 0.0.0.0 & sleep 2s; mongosh --host localhost:27017 --eval '' config = { "_id" : "myRS", "members" : [{"_id" : 0,"host" : "mongo:27017"}] }; rs.initiate(config, { force: true }); '' ; sleep infinity'
]
"/bin/sh",
"-c",
'mongod --quiet --replSet myRS --bind_ip 0.0.0.0 & sleep 2s; mongosh --host localhost:27017 --eval '' config = { "_id" : "myRS", "members" : [{"_id" : 0,"host" : "mongo:27017"}] }; rs.initiate(config, { force: true }); '' ; sleep infinity',
]
10 changes: 10 additions & 0 deletions src/Echo/src/EchoEngine/TranslationEngineService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ await _taskQueue.QueueBackgroundWorkItemAsync(

try
{
//Wait for build to exist in the database before starting the build.
TimeSpan timeout = TimeSpan.FromSeconds(60);
DateTime start = DateTime.UtcNow;
while (
(DateTime.UtcNow - start < timeout)
&& !await platform.BuildExistsAsync(buildId, linkedCts.Token)
)
{
await Task.Delay(TimeSpan.FromMilliseconds(10), linkedCts.Token);
}
await platform.BuildStartedAsync(buildId, linkedCts.Token);

int trainCount = 0;
Expand Down
10 changes: 10 additions & 0 deletions src/Echo/src/EchoEngine/WordAlignmentEngineService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ await _taskQueue.QueueBackgroundWorkItemAsync(

try
{
//Wait for build to exist in the database before starting the build.
TimeSpan timeout = TimeSpan.FromSeconds(60);
DateTime start = DateTime.UtcNow;
while (
(DateTime.UtcNow - start < timeout)
&& !await platform.BuildExistsAsync(buildId, linkedCts.Token)
)
{
await Task.Delay(TimeSpan.FromMilliseconds(10), linkedCts.Token);
}
await platform.BuildStartedAsync(buildId, linkedCts.Token);

int trainCount = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,9 @@ private static IServalConfigurator AddTranslationEngines(this IServalConfigurato
configurator.Services.AddSingleton<ITransferEngineFactory, TransferEngineFactory>();
configurator.Services.AddSingleton<ITruecaserFactory, UnigramTruecaserFactory>();
configurator.AddTranslationEngine<SmtTransferEngineService>(EngineType.SmtTransfer.ToString());
configurator.JobQueues.Add(BuildJobQueues.SmtTransfer);

// NMT Engine
configurator.AddTranslationEngine<NmtEngineService>(EngineType.Nmt.ToString());
configurator.JobQueues.Add(BuildJobQueues.Nmt);

return configurator;
}
Expand All @@ -84,7 +82,6 @@ private static IServalConfigurator AddWordAlignmentEngines(this IServalConfigura
configurator.Services.AddSingleton<IWordAlignmentModelFactory, ThotWordAlignmentModelFactory>();
configurator.AddWordAlignmentEngine<StatisticalEngineService>(EngineType.Statistical.ToString());
configurator.Services.AddHostedService<StatisticalEngineCommitService>();
configurator.JobQueues.Add(BuildJobQueues.Statistical);

return configurator;
}
Expand Down Expand Up @@ -206,10 +203,12 @@ private static IServalConfigurator AddBuildJobService(this IServalConfigurator c
configurator.Services.AddSingleton<IClearMLQueueService>(x => x.GetRequiredService<ClearMLMonitorService>());
configurator.Services.AddHostedService(p => p.GetRequiredService<ClearMLMonitorService>());

configurator.Services.AddScoped<IBuildJobRunner, HangfireBuildJobRunner>();
configurator.Services.AddScoped<IHangfireBuildJobFactory, NmtHangfireBuildJobFactory>();
configurator.Services.AddScoped<IHangfireBuildJobFactory, SmtTransferHangfireBuildJobFactory>();
configurator.Services.AddScoped<IHangfireBuildJobFactory, StatisticalHangfireBuildJobFactory>();
configurator.Services.AddSingleton<LocalBuildJobRunner>();
configurator.Services.AddSingleton<IBuildJobRunner>(sp => sp.GetRequiredService<LocalBuildJobRunner>());
configurator.Services.AddHostedService(sp => sp.GetRequiredService<LocalBuildJobRunner>());
configurator.Services.AddSingleton<ILocalBuildJobFactory, NmtLocalBuildJobFactory>();
configurator.Services.AddSingleton<ILocalBuildJobFactory, SmtTransferLocalBuildJobFactory>();
configurator.Services.AddSingleton<ILocalBuildJobFactory, StatisticalLocalBuildJobFactory>();

var smtTransferEngineOptions = new SmtTransferEngineOptions();
configurator.Configuration.GetSection(SmtTransferEngineOptions.Key).Bind(smtTransferEngineOptions);
Expand Down
4 changes: 3 additions & 1 deletion src/Machine/src/Serval.Machine.Shared/Models/Build.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ public enum BuildJobState

public enum BuildJobRunnerType
{
Hangfire,
ClearML,
Local,
}

public enum BuildStage
Expand All @@ -28,6 +28,8 @@ public record Build
public required string JobId { get; init; }
public required BuildJobRunnerType BuildJobRunner { get; init; }
public required BuildStage Stage { get; init; }
public DateTimeOffset QueuedAt { get; init; }
public string? Options { get; set; }
public string? JobData { get; init; }
public required BuildExecutionData ExecutionData { get; init; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,11 @@
<PackageReference Include="Bugsnag" Version="4.1.0" />
<PackageReference Include="Bugsnag.AspNet.Core" Version="4.1.0" />
<PackageReference Include="CommunityToolkit.HighPerformance" Version="8.4.2" />
<PackageReference Include="Hangfire.Core" Version="1.8.23" />
<PackageReference Include="Microsoft.AspNetCore.Mvc.NewtonsoftJson" Version="10.0.5" />
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="10.0.5" />
<PackageReference Include="SIL.Machine" Version="3.8.2" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine\SIL.Machine.csproj')" />
<PackageReference Include="SIL.Machine.Morphology.HermitCrab" Version="3.8.2" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Morphology.HermitCrab\SIL.Machine.Morphology.HermitCrab.csproj')" />
<PackageReference Include="SIL.Machine.Translation.Thot" Version="3.8.2" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Translation.Thot\SIL.Machine.Translation.Thot.csproj')" />
<PackageReference Include="SIL.Machine" Version="3.8.3" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine\SIL.Machine.csproj')" />
<PackageReference Include="SIL.Machine.Morphology.HermitCrab" Version="3.8.3" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Morphology.HermitCrab\SIL.Machine.Morphology.HermitCrab.csproj')" />
<PackageReference Include="SIL.Machine.Translation.Thot" Version="3.8.3" Condition="!Exists('..\..\..\..\..\machine\src\SIL.Machine.Translation.Thot\SIL.Machine.Translation.Thot.csproj')" />
<PackageReference Include="SIL.WritingSystems" Version="17.0.0" />
<PackageReference Include="System.Linq.Async" Version="7.0.0" />
<PackageReference Include="YamlDotNet" Version="16.3.0" />
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
namespace Serval.Machine.Shared.Services;
namespace Serval.Machine.Shared.Services;

public abstract class HangfireBuildJob<TEngine>(
public abstract class BuildJob<TEngine>(
IPlatformService platformService,
IRepository<TEngine> engines,
IDataAccessContext dataAccessContext,
IBuildJobService<TEngine> buildJobService,
ILogger<HangfireBuildJob<TEngine>> logger
) : HangfireBuildJob<TEngine, object?>(platformService, engines, dataAccessContext, buildJobService, logger)
ILogger<BuildJob<TEngine>> logger
) : BuildJob<TEngine, object?>(platformService, engines, dataAccessContext, buildJobService, logger)
where TEngine : ITrainingEngine
{
[AutomaticRetry(Attempts = 0)]
public virtual Task RunAsync(
string engineId,
string buildId,
Expand All @@ -21,22 +20,21 @@ CancellationToken cancellationToken
}
}

public abstract class HangfireBuildJob<TEngine, TData>(
public abstract class BuildJob<TEngine, TData>(
IPlatformService platformService,
IRepository<TEngine> engines,
IDataAccessContext dataAccessContext,
IBuildJobService<TEngine> buildJobService,
ILogger<HangfireBuildJob<TEngine, TData>> logger
ILogger<BuildJob<TEngine, TData>> logger
)
where TEngine : ITrainingEngine
{
protected IPlatformService PlatformService { get; } = platformService;
protected IRepository<TEngine> Engines { get; } = engines;
protected IDataAccessContext DataAccessContext { get; } = dataAccessContext;
protected IBuildJobService<TEngine> BuildJobService { get; } = buildJobService;
protected ILogger<HangfireBuildJob<TEngine, TData>> Logger { get; } = logger;
protected ILogger<BuildJob<TEngine, TData>> Logger { get; } = logger;

[AutomaticRetry(Attempts = 0)]
public virtual async Task RunAsync(
string engineId,
string buildId,
Expand All @@ -59,8 +57,7 @@ CancellationToken cancellationToken
}
catch (OperationCanceledException e)
{
// Log the full exception for debugging purposes
Logger.LogInformation(e, "Build Hangfire job canceled ({0})", buildId);
Logger.LogInformation(e, "Build job canceled ({0})", buildId);

// Check if the cancellation was initiated by an API call or a shutdown.
TEngine? engine = await Engines.GetAsync(
Expand All @@ -87,8 +84,7 @@ await BuildJobService.BuildJobFinishedAsync(
}
else if (engine is not null)
{
// the build was canceled, because of a server shutdown
// switch state back to pending
// the build was canceled because of a server shutdown — switch state back to pending
completionStatus = JobCompletionStatus.Restarting;
await DataAccessContext.WithTransactionAsync(
async (ct) =>
Expand Down Expand Up @@ -128,7 +124,7 @@ await BuildJobService.BuildJobFinishedAsync(
}
finally
{
await CleanupAsync(engineId, buildId, data, completionStatus);
await CleanupAsync(engineId, buildId, completionStatus);
}
}

Expand All @@ -150,12 +146,7 @@ protected abstract Task DoWorkAsync(
CancellationToken cancellationToken
);

protected virtual Task CleanupAsync(
string engineId,
string buildId,
TData data,
JobCompletionStatus completionStatus
)
protected virtual Task CleanupAsync(string engineId, string buildId, JobCompletionStatus completionStatus)
{
return Task.CompletedTask;
}
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public async Task<bool> StartBuildJobAsync(
)
{
IBuildJobRunner runner = Runners[runnerType];
string jobId = await runner.CreateJobAsync(
(string jobId, string? jobData) = await runner.CreateJobAsync(
engineType,
engineId,
buildId,
Expand Down Expand Up @@ -102,7 +102,9 @@ public async Task<bool> StartBuildJobAsync(
BuildJobRunner = runner.Type,
Stage = stage,
JobState = BuildJobState.Pending,
QueuedAt = DateTimeOffset.UtcNow,
Options = buildOptions,
JobData = jobData,
ExecutionData = new BuildExecutionData(),
}
),
Expand Down
25 changes: 0 additions & 25 deletions src/Machine/src/Serval.Machine.Shared/Services/BuildProgress.cs

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public async Task DeleteEngineAsync(string engineId, CancellationToken cancellat
await _clearMLService.DeleteProjectAsync(projectId, cancellationToken);
}

public async Task<string> CreateJobAsync(
public async Task<(string JobId, string? JobData)> CreateJobAsync(
EngineType engineType,
string engineId,
string buildId,
Expand All @@ -47,25 +47,25 @@ public async Task<string> CreateJobAsync(

ClearMLTask? task = await _clearMLService.GetTaskByNameAsync(buildId, cancellationToken);
if (task is not null)
return task.Id;
return (task.Id, null);

IClearMLBuildJobFactory buildJobFactory = _buildJobFactories[engineType];
string script = await buildJobFactory.CreateJobScriptAsync(
engineId,
buildId,
_options[engineType].ModelType,
stage,
data,
buildOptions,
cancellationToken
);
return await _clearMLService.CreateTaskAsync(
string jobId = await _clearMLService.CreateTaskAsync(
buildId,
projectId,
script,
_options[engineType].DockerImage,
cancellationToken
);
return (jobId, null);
}

public Task<bool> DeleteJobAsync(string jobId, CancellationToken cancellationToken = default)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ CancellationToken cancellationToken
try
{
return await buildJobService.StartBuildJobAsync(
BuildJobRunnerType.Hangfire,
BuildJobRunnerType.Local,
engineType,
engineId,
buildId,
Expand Down
Loading
Loading