Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 116 additions & 105 deletions src/Serval/src/Serval.Translation/Services/UsfmGenerationService.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using SIL.Machine.Corpora;
using SIL.Machine.Corpora;
using SIL.Machine.PunctuationAnalysis;
using SIL.Machine.Translation;
using SIL.Scripture;
Expand All @@ -17,7 +17,7 @@ ContractMapper contractMapper
private readonly IRepository<Build> _builds = builds;
private readonly ContractMapper _contractMapper = contractMapper;
private const string AIDisclaimerRemark =
"This draft of {0} was generated using AI on {1}. It should be reviewed and edited carefully.";
"This draft of {0} was generated by AI from {1} on {2}. It should be reviewed carefully for errors before use. {3}";

public async Task<string> GetUsfmAsync(
string engineId,
Expand Down Expand Up @@ -76,24 +76,61 @@ public async Task<string> GetUsfmAsync(
Build? build = (await _builds.GetAllAsync(b => b.EngineRef == engineId, cancellationToken))
.OrderByDescending(b => b.DateFinished)
.FirstOrDefault();
if (build is null || build.DateFinished is null)
if (build?.DateFinished is null)
throw new InvalidOperationException($"Could not find any completed builds for engine '{engineId}'.");

string disclaimerRemark = string.Format(
CultureInfo.InvariantCulture,
AIDisclaimerRemark,
textId,
build.DateFinished.Value.ToUniversalTime().ToString("u")
);
string markerPlacementRemark = GenerateMarkerPlacementRemark(
paragraphMarkerBehavior,
embedBehavior,
styleMarkerBehavior
);

List<string> remarks = [disclaimerRemark, markerPlacementRemark];
ParallelCorpusContract[] parallelCorpora = [.. _contractMapper.Map(build, engine)];

// Get the versification for the project
CorpusBundle corpusBundle = new(parallelCorpora);
ParallelCorpusContract corpusContract = corpusBundle.ParallelCorpora.Single(c => c.Id == corpusId);
CorpusFileContract sourceFile = corpusContract.SourceCorpora[0].Files[0];
ParatextProjectSettings? sourceSettings = corpusBundle.GetSettings(sourceFile.Location);
ScrVers versification = sourceSettings?.Versification ?? ScrVers.Original;
var scriptureRangeParser = new ScriptureRangeParser(versification);

// Generate remarks for every chapter in the book
List<(int, string)> remarks = [];
List<int>? chapters =
build
.Pretranslate?.SelectMany(p => p.SourceFilters ?? [])
.SelectMany(s =>
scriptureRangeParser
.GetChapters(s.ScriptureRange)
.TryGetValue(textId, out List<int>? filterChapters)
? filterChapters
: []
)
.ToList()
?? [];

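// If there are no chapters, we need to set it to null so that the USFM updater is not handed an empty chapter list.
// A null value falls back to every chapter in the book when the remarks are generated below.
// NOTE(review): presumably the USFM updater also treats null as "all chapters" - confirm.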
if (chapters.Count == 0)
chapters = null;

ParallelCorpusContract[] parallelCorpora = _contractMapper.Map(build, engine).ToArray();
// Get all the chapters needing remarks
IEnumerable<int> chaptersNeedingRemarks =
chapters ?? Enumerable.Range(1, versification.GetLastChapter(Canon.BookIdToNumber(textId)));

// Add remarks to each chapter
foreach (int chapterNum in chaptersNeedingRemarks)
{
string disclaimerRemark = string.Format(
CultureInfo.InvariantCulture,
AIDisclaimerRemark,
$"{textId} {chapterNum}",
sourceSettings?.Name ?? "Unknown",
build.DateFinished.Value.ToUniversalTime().ToString("u"),
markerPlacementRemark
);
remarks.Add((chapterNum, disclaimerRemark));
}

IReadOnlyList<Pretranslation> pretranslations = await _pretranslations.GetAllAsync(
pt =>
Expand Down Expand Up @@ -126,6 +163,7 @@ public async Task<string> GetUsfmAsync(
corpusId,
textId,
textOrigin == PretranslationUsfmTextOrigin.OnlyExisting ? [] : pretranslations,
chapters,
textBehavior,
Map(paragraphMarkerBehavior),
Map(embedBehavior),
Expand All @@ -146,6 +184,7 @@ public async Task<string> GetUsfmAsync(
corpusId,
textId,
textOrigin == PretranslationUsfmTextOrigin.OnlyExisting ? [] : pretranslations,
chapters,
Map(paragraphMarkerBehavior),
Map(embedBehavior),
Map(styleMarkerBehavior),
Expand All @@ -163,11 +202,12 @@ private static string UpdateSourceUsfm(
string corpusId,
string bookId,
IReadOnlyList<Pretranslation> pretranslations,
IReadOnlyList<int>? chapters,
UpdateUsfmMarkerBehavior paragraphBehavior,
UpdateUsfmMarkerBehavior embedBehavior,
UpdateUsfmMarkerBehavior styleBehavior,
bool placeParagraphMarkers,
IEnumerable<string>? remarks,
IEnumerable<(int, string)> remarks,
string? targetQuoteConvention
)
{
Expand All @@ -176,6 +216,7 @@ private static string UpdateSourceUsfm(
corpusId,
bookId,
pretranslations,
chapters,
UpdateUsfmTextBehavior.StripExisting,
paragraphBehavior,
embedBehavior,
Expand All @@ -192,11 +233,12 @@ private static string UpdateTargetUsfm(
string corpusId,
string bookId,
IReadOnlyList<Pretranslation> pretranslations,
IReadOnlyList<int>? chapters,
UpdateUsfmTextBehavior textBehavior,
UpdateUsfmMarkerBehavior paragraphBehavior,
UpdateUsfmMarkerBehavior embedBehavior,
UpdateUsfmMarkerBehavior styleBehavior,
IEnumerable<string>? remarks,
IEnumerable<(int, string)> remarks,
string? targetQuoteConvention
)
{
Expand All @@ -205,6 +247,7 @@ private static string UpdateTargetUsfm(
corpusId,
bookId,
pretranslations,
chapters,
textBehavior,
paragraphBehavior,
embedBehavior,
Expand All @@ -221,12 +264,13 @@ private static string UpdateUsfm(
string corpusId,
string bookId,
IEnumerable<Pretranslation> pretranslations,
IReadOnlyList<int>? chapters,
UpdateUsfmTextBehavior textBehavior,
UpdateUsfmMarkerBehavior paragraphBehavior,
UpdateUsfmMarkerBehavior embedBehavior,
UpdateUsfmMarkerBehavior styleBehavior,
IEnumerable<IUsfmUpdateBlockHandler>? updateBlockHandlers,
IEnumerable<string>? remarks,
IEnumerable<(int, string)> remarks,
string? targetQuoteConvention,
bool isSource
)
Expand All @@ -244,33 +288,36 @@ bool isSource
string usfm =
updater.UpdateUsfm(
bookId,
pretranslations
.Select(p =>
Map(
p,
isSource,
sourceSettings?.Versification,
targetSettings?.Versification,
paragraphBehavior,
styleBehavior
[
.. pretranslations
.Select(p =>
Map(
p,
isSource,
sourceSettings?.Versification,
targetSettings?.Versification,
paragraphBehavior,
styleBehavior
)
)
)
.Where(row => row.Refs.Any())
.OrderBy(row => row.Refs[0])
.ToArray(),
.Where(row => row.Refs.Any())
.OrderBy(row => row.Refs[0]),
],
chapters: chapters,
fullName: isSource ? sourceSettings?.FullName : targetSettings?.FullName,
textBehavior: textBehavior,
paragraphBehavior: paragraphBehavior,
embedBehavior: embedBehavior,
styleBehavior: styleBehavior,
preserveParagraphStyles: null,
updateBlockHandlers: updateBlockHandlers,
remarks: remarks?.Select(r => (0, r)),
!string.IsNullOrEmpty(targetQuoteConvention) ? null : remarks, // Ensure we only add remarks once
errorHandler: (_) => true,
compareSegments: isSource
) ?? "";

if (!string.IsNullOrEmpty(targetQuoteConvention))
usfm = DenormalizeQuotationMarks(usfm, targetQuoteConvention);
usfm = DenormalizeQuotationMarks(usfm, targetQuoteConvention, remarks);
return usfm;
}

Expand Down Expand Up @@ -351,7 +398,11 @@ pretranslation.Alignment is null
return matrix;
}

private static string DenormalizeQuotationMarks(string usfm, string quoteConvention)
private static string DenormalizeQuotationMarks(
string usfm,
string quoteConvention,
IEnumerable<(int, string)> remarks
)
{
QuoteConvention targetQuoteConvention = QuoteConventions.Standard.GetQuoteConventionByName(quoteConvention);
if (targetQuoteConvention is null)
Expand All @@ -372,79 +423,35 @@ private static string DenormalizeQuotationMarks(string usfm, string quoteConvent
int denormalizableChapterCount = bestChapterStrategies.Count(tup =>
tup.Strategy != QuotationMarkUpdateStrategy.Skip
);
List<string> remarks = [];
string quotationDenormalizationRemark;
if (denormalizableChapterCount == bestChapterStrategies.Count)
{
quotationDenormalizationRemark =
"The quote style in all chapters has been automatically adjusted to match the rest of the project.";
}
else if (denormalizableChapterCount > 0)
{
quotationDenormalizationRemark =
"The quote style in the following chapters has been automatically adjusted to match the rest of the project: "
+ GetChapterRangesString(
bestChapterStrategies
.Where(tuple => tuple.Strategy != QuotationMarkUpdateStrategy.Skip)
.Select(tuple => tuple.ChapterNumber)
.ToList()
)
+ ".";
}
else
const string QuotationDenormalizationRemark =
"Quotation marks have been adjusted automatically to match the rest of the project.";
List<(int Chapter, string Remark)> combinedRemarks = [.. remarks];
// Attach the quotation-mark remark to the chapters that were actually denormalized.
// Counting from 1 up to denormalizableChapterCount would treat a count as chapter numbers
// and tag the wrong chapters whenever a chapter was skipped or the chapters do not start at 1.
foreach (
    int chapterNumber in bestChapterStrategies
        .Where(tup => tup.Strategy != QuotationMarkUpdateStrategy.Skip)
        .Select(tup => tup.ChapterNumber)
)
{
    int index = combinedRemarks.FindLastIndex(r => r.Chapter == chapterNumber);
    if (index > -1)
    {
        // Append to the last existing remark for this chapter rather than adding a separate entry.
        combinedRemarks[index] = combinedRemarks[index] with
        {
            Remark = $"{combinedRemarks[index].Remark} {QuotationDenormalizationRemark}",
        };
    }
    else
    {
        // No remark exists for this chapter yet, so the quotation remark stands on its own.
        combinedRemarks.Add((chapterNumber, QuotationDenormalizationRemark));
    }
}
remarks.Add(quotationDenormalizationRemark);

var updater = new UpdateUsfmParserHandler(
updateBlockHandlers: [quotationMarkDenormalizer],
remarks: remarks.Select(r => (0, r))
remarks: combinedRemarks
);
UsfmParser.Parse(usfm, updater);

usfm = updater.GetUsfm();
return usfm;
}

/// <summary>
/// Formats chapter numbers as a comma-separated list of consecutive ranges, e.g. "1-3, 5, 7-8".
/// </summary>
/// <param name="chapterNumbers">The chapter numbers, in any order. Duplicates are ignored.</param>
/// <returns>
/// The formatted ranges, or an empty string when <paramref name="chapterNumbers"/> is empty.
/// </returns>
internal static string GetChapterRangesString(List<int> chapterNumbers)
{
    // Work on a sorted, de-duplicated copy: a repeated chapter would otherwise break a run
    // and be emitted twice (e.g. "1, 1-2"), and the caller's list is left untouched.
    List<int> ordered = chapterNumbers.Distinct().Order().ToList();
    if (ordered.Count == 0)
        return string.Empty;

    List<string> chapterRangeStrings = [];
    int start = ordered[0];
    int end = ordered[0];
    foreach (int chapterNumber in ordered.Skip(1))
    {
        if (chapterNumber == end + 1)
        {
            // Still inside a consecutive run; extend it.
            end = chapterNumber;
            continue;
        }

        // The run is broken: emit it and start a new one at the current chapter.
        chapterRangeStrings.Add(FormatRange(start, end));
        start = chapterNumber;
        end = chapterNumber;
    }

    // Emit the final run, which the loop never closes.
    chapterRangeStrings.Add(FormatRange(start, end));
    return string.Join(", ", chapterRangeStrings);

    // A single chapter is written as "5"; a run of chapters as "1-3".
    static string FormatRange(int first, int last) =>
        first == last ? first.ToString(CultureInfo.InvariantCulture) : $"{first}-{last}";
}

/// <summary>
/// Generate a natural sounding remark/comment describing marker placement.
/// </summary>
Expand All @@ -456,13 +463,13 @@ internal static string GetChapterRangesString(List<int> chapterNumbers)
/// <para>Remarks are generated in the format:</para>
/// <list type="bullet">
/// <item><description>
/// Paragraph breaks, embed markers, and style markers were moved to the end of the verse.
/// Paragraph breaks, embed markers, and style marker positions were moved to the end of the verse.
/// </description></item>
/// <item><description>
/// Paragraph breaks were moved to the end of the verse. Embed markers have positions preserved. Style markers were removed.
/// Paragraph break positions were moved to the end of the verse. Embed marker positions have been preserved. Style markers were removed.
/// </description></item>
/// <item><description>
/// Paragraph breaks and style markers were moved to the end of the verse. Embed markers were removed.
/// Paragraph break and style marker positions were moved to the end of the verse. Embed markers were removed.
/// </description></item>
/// </list>
/// </remarks>
Expand All @@ -479,25 +486,29 @@ PretranslationUsfmMarkerBehavior styleMarkerBehavior
{ PretranslationUsfmMarkerBehavior.Strip, [] },
};

behaviorMap[paragraphMarkerBehavior].Add("paragraph breaks");
behaviorMap[embedBehavior].Add("embed markers");
behaviorMap[styleMarkerBehavior].Add("style markers");
behaviorMap[paragraphMarkerBehavior].Add("paragraph break");
behaviorMap[embedBehavior].Add("embed marker");
behaviorMap[styleMarkerBehavior].Add("style marker");

IEnumerable<string> sentences = behaviorMap
.Where(kvp => kvp.Value.Count > 0)
.Select(kvp =>
{
string markers =
string markersSingular =
kvp.Value.Count == 1 ? kvp.Value[0] : string.Join(", ", kvp.Value[..^1]) + " and " + kvp.Value[^1];
markers = char.ToUpperInvariant(markers[0]) + markers[1..];
string behavior = kvp.Key switch
// Plural form of the marker list, e.g. "paragraph breaks, embed markers and style markers".
// Each name is pluralized individually before joining: appending "s" to the sliced list itself
// would concatenate the list's ToString() (its type name) instead of the marker names.
string markersPlural =
    kvp.Value.Count == 1
        ? kvp.Value[0] + "s"
        : string.Join(", ", kvp.Value[..^1].Select(marker => marker + "s")) + " and " + kvp.Value[^1] + "s";
string sentence = kvp.Key switch
{
PretranslationUsfmMarkerBehavior.Preserve => "were moved to the end of the verse",
PretranslationUsfmMarkerBehavior.PreservePosition => "have positions preserved",
PretranslationUsfmMarkerBehavior.Strip => "were removed",
_ => "have unknown behavior",
PretranslationUsfmMarkerBehavior.Preserve =>
$"{markersSingular} positions were moved to the end of the verse.",
PretranslationUsfmMarkerBehavior.PreservePosition => $"{markersSingular} positions were preserved.",
PretranslationUsfmMarkerBehavior.Strip => $"{markersPlural} were removed.",
_ => $"{markersPlural} have unknown behavior.",
};
return $"{markers} {behavior}.";
return char.ToUpperInvariant(sentence[0]) + sentence[1..];
});

return string.Join(" ", sentences);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2291,10 +2291,9 @@ await _env.Builds.InsertAsync(
usfm.Replace("\r\n", "\n"),
Is.EqualTo(
@"\id MAT - Test1
\rem This draft of MAT was generated using AI on 1970-01-01 00:00:00Z. It should be reviewed and edited carefully.
\rem Paragraph breaks and embed markers were moved to the end of the verse. Style markers were removed.
\h
\c 1
\rem This draft of MAT 1 was generated by AI from Te1 on 1970-01-01 00:00:00Z. It should be reviewed carefully for errors before use. Paragraph break and embed marker positions were moved to the end of the verse. Style markers were removed.
\p
\v 1 translation
\v 2
Expand Down
Loading
Loading