From cb4f6dfd17e2a98cba7de2dbd1c4776a81d52887 Mon Sep 17 00:00:00 2001
From: Peter Polak <polak@ufal.mff.cuni.cz>
Date: Mon, 2 Mar 2026 00:22:05 +0100
Subject: [PATCH 1/2] Implement OmniSTEval evaluation

---
 README.md                               |  32 +++-
 pyproject.toml                          |   4 +-
 simulstream/metrics/score_omnisteval.py | 218 ++++++++++++++++++++++++
 3 files changed, 248 insertions(+), 6 deletions(-)
 create mode 100644 simulstream/metrics/score_omnisteval.py

diff --git a/README.md b/README.md
index 27aaf41..8b2d94f 100644
--- a/README.md
+++ b/README.md
@@ -167,27 +167,27 @@ option can be followed by running the dedicated command:
 ```
 simulstream_inference --speech-processor-config config/speech_processor.yaml \
     --wav-list-file PATH_TO_TXT_FILE_WITH_A_LIST_OF_WAV_FILES.txt \
-    --tgt-lang it --src-lang en --metrics-log-file metrics.jsonl
+    --tgt-lang it --src-lang en --metrics-log-file inference_log_file.jsonl
 ```
 
-Once you have generated the JSONL file containing the inference log (e.g. ``metrics.jsonl``), you
+Once you have generated the JSONL file containing the inference log (e.g. ``inference_log_file.jsonl``), you
 can score your speech processor by running:
 
 ```shell
 simulstream_score_latency --scorer stream_laal \
     --eval-config config/speech_processor.yaml \
-    --log-file metrics.jsonl \
+    --log-file inference_log_file.jsonl \
     --reference REFERENCE_FILE.txt \
     --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 
 simulstream_score_quality --scorer comet \
     --eval-config config/speech_processor.yaml \
-    --log-file metrics.jsonl \
+    --log-file inference_log_file.jsonl \
     --references REFERENCES_FILE.txt \
     --transcripts TRANSCRIPTS_FILE.txt
 
 simulstream_stats --eval-config config/speech_processor.yaml \
-    --log-file metrics.jsonl
+    --log-file inference_log_file.jsonl
 ```
 
 Each of them will output different metrics. ``simulstream_score_latency`` provides the metric for
@@ -202,6 +202,28 @@ requiring them) file(s).
 
 Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
 
+#### Combined Quality and Latency Evaluation
+
+For a combined quality and latency evaluation, you can run a single command that computes quality
+metrics (BLEU, chrF, and, with `--comet`, COMET) and latency metrics (LongYAAL, LongLAAL, LongAL,
+LongDAL, LongAP) using the [OmniSTEval toolkit](https://github.com/pe-trik/OmniSTEval) as the backend.
+
+Example:
+
+```shell
+simulstream_run_omnisteval \
+    --eval-config config/speech_processor.yaml \
+    --log-file inference_log_file.jsonl \
+    --audio-definition audio_definition.yaml \
+    --reference REFERENCES_FILE.txt \
+    --latency-unit word \
+    --output-folder evaluation_results
+```
+
+The `audio_definition.yaml` file must map each audio file to its `offset` and `duration`.
+Results are written to the specified output folder (e.g. `evaluation_results`) as TSV metric files,
+a JSONL dump of the resegmented instances, and a human-readable report (`evaluation_report.txt`).
+
 
 ## Contributing
 
diff --git a/pyproject.toml b/pyproject.toml
index 45a6581..1688bae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ simulstream_inference = "simulstream.inference:cli_main"
 simulstream_score_quality = "simulstream.metrics.score_quality:cli_main"
 simulstream_score_latency = "simulstream.metrics.score_latency:cli_main"
 simulstream_stats = "simulstream.metrics.stats:cli_main"
+simulstream_run_omnisteval = "simulstream.metrics.score_omnisteval:cli_main"
 
 [project.optional-dependencies]
 dev = [
@@ -62,7 +63,8 @@ vad = [
 eval = [
     "unbabel-comet==2.2.6",
     "mweralign",
-    "sacrebleu"
+    "sacrebleu",
+    "omnisteval>=0.1.3",
 ]
 
 [tool.setuptools.dynamic]
diff --git a/simulstream/metrics/score_omnisteval.py b/simulstream/metrics/score_omnisteval.py
new file mode 100644
index 0000000..fa34810
--- /dev/null
+++ b/simulstream/metrics/score_omnisteval.py
@@ -0,0 +1,218 @@
+import argparse
+import logging
+import os
+from typing import Optional
+
+import omnisteval
+import sacrebleu
+
+import simulstream
+from omnisteval.io import load_resegmentation_inputs as load_inputs, dump_instances_jsonl, dump_scores_tsv, format_report
+from omnisteval import resegment
+from omnisteval import evaluate_instances
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+    force=True,
+)
+LOGGER = logging.getLogger("simulstream.score_omnisteval")
+
+
+def _build_settings(
+    source_sentences_file: str,
+    audio_definition: str,
+    reference: str,
+    log_file: str,
+    latency_unit: str,
+    eval_config: str,
+    moses_tokenizer_lang: Optional[str],
+    bleu_tokenizer: str,
+    comet: bool,
+    comet_model: str,):
+    """
+    Build a dictionary of settings to log alongside the scores.
+    This is not used for the actual scoring, but can be helpful for record-keeping and debugging.
+    Keys match OmniSTEval settings where applicable.
+    """
+
+    return {
+        "Hypothesis": log_file,
+        "Hypothesis format": "simulstream",
+        "Reference": reference,
+        "Source sentences": source_sentences_file or "none",
+        "Eval config": eval_config,
+        "Segmentation": audio_definition,
+        "Seg. type": "speech",
+        "Language": moses_tokenizer_lang or "none",
+        "BLEU tokenizer": bleu_tokenizer,
+        "Char-level": "yes" if latency_unit == "char" else "no",
+        "Offset delays": "no",
+        "Fix CA emissions": "no",
+        "COMET model": comet_model if comet else "none",
+        "OmniSTEval version": omnisteval.__version__,
+        "Simulstream version": simulstream.__version__,
+    }
+
+
+def main(
+    source_sentences_file: str,
+    audio_definition: str,
+    reference: str,
+    log_file: str,
+    latency_unit: str,
+    eval_config: str,
+    output_folder: str,
+    moses_tokenizer_lang: Optional[str],
+    bleu_tokenizer: str,
+    comet: bool,
+    comet_model: str,
+):
+    LOGGER.info(f"Simulstream version: {simulstream.__version__}")
+
+    LOGGER.info("Loading evaluation config and log file...")
+
+    source_sentences = None
+    if source_sentences_file is not None:
+        with open(source_sentences_file, "r", encoding="utf-8") as f:
+            source_sentences = [line.strip() for line in f]
+
+    LOGGER.info("Loading resegmentation inputs for OmniSTEval...")
+    ref_words, hyp_words, segmentation, ref_sentences, all_have_emission_ca = (
+        load_inputs(
+            audio_definition,
+            None,
+            reference,
+            log_file,
+            hypothesis_format="simulstream",
+            char_level=(latency_unit == "char"),
+            offset_delays=False,
+            fix_emission_ca_flag=False,
+            simulstream_config_file=eval_config,
+        )
+    )
+
+    # suppress mosestokenizer INFO logs which are very verbose
+    logging.getLogger("mosestokenizer").setLevel(logging.WARNING)
+
+    LOGGER.info("Running resegmentation with OmniSTEval...")
+    instances, instances_dict = resegment(
+        ref_words=ref_words,
+        hyp_words=hyp_words,
+        segmentation=segmentation,
+        ref_sentences=ref_sentences,
+        char_level=(latency_unit == "char"),
+        lang=moses_tokenizer_lang,
+        has_emission_timestamps=all_have_emission_ca,
+    )
+
+    LOGGER.info("Computing metrics...")
+    scores = evaluate_instances(
+        instances=instances,
+        compute_quality=True,
+        compute_latency=True,
+        is_longform=True,
+        bleu_tokenizer=bleu_tokenizer,
+        all_have_emission_ca=all_have_emission_ca,
+        fix_emission_ca_flag=False,
+        compute_comet=comet,
+        comet_model=comet_model,
+        source_sentences=source_sentences,
+    )
+
+    settings = _build_settings(
+        source_sentences_file=source_sentences_file,
+        audio_definition=audio_definition,
+        reference=reference,
+        log_file=log_file,
+        latency_unit=latency_unit,
+        eval_config=eval_config,
+        moses_tokenizer_lang=moses_tokenizer_lang,
+        bleu_tokenizer=bleu_tokenizer,
+        comet=comet,
+        comet_model=comet_model,
+    )
+    report = format_report("Longform evaluation (with resegmentation)", settings, scores)
+    LOGGER.info(f"\n{report}")
+
+    if output_folder is not None:
+        dump_instances_jsonl(instances_dict, output_folder)
+        dump_scores_tsv(scores, output_folder, is_longform=True)
+        with open(os.path.join(output_folder, "evaluation_report.txt"), "w", encoding="utf-8") as f:
+            f.write(report)
+
+
+def cli_main():
+    parser = argparse.ArgumentParser(
+        "run_omnisteval",
+        description="Score streaming translation outputs using OmniSTEval."
+    )
+    parser.add_argument("--eval-config", type=str, required=True)
+    parser.add_argument("--log-file", type=str, required=True)
+    parser.add_argument(
+        "--audio-definition",
+        "-a",
+        type=str,
+        required=True,
+        help="Path to the yaml file containing the segment-level audio information.",
+    )
+    parser.add_argument(
+        "--reference",
+        "-r",
+        type=str,
+        required=True,
+        help="Path to the textual file containing segment-level references stored line by line.",
+    )
+    parser.add_argument(
+        "--output-folder",
+        type=str,
+        default=None,
+        help="Optional output folder for OmniSTEval artifacts.",
+    )
+    parser.add_argument(
+        "--latency-unit",
+        choices=["word", "char"],
+        default="word",
+        help="Whether to compute stats based on words or characters. Default: word.",
+    )
+    parser.add_argument(
+        "--bleu-tokenizer",
+        choices=sacrebleu.metrics.METRICS["BLEU"].TOKENIZERS,
+        default=sacrebleu.metrics.METRICS["BLEU"].TOKENIZER_DEFAULT,
+    )
+    parser.add_argument(
+        "--moses-tokenizer-lang",
+        type=str,
+        default="en",
+        help='Language code for Moses tokenizer if BLEU tokenizer is set to "13a". Default: en.',
+    )
+    parser.add_argument(
+        "--comet-model",
+        type=str,
+        default="Unbabel/wmt22-comet-da",
+        help=(
+            "Name or path of the COMET model to use for quality estimation when --comet is enabled. "
+            "Default: Unbabel/wmt22-comet-da."),
+    )
+    parser.add_argument("--comet", action="store_true", help="Enable COMET scoring.")
+    parser.add_argument("--source-sentences-file", type=str, default=None)
+    args = parser.parse_args()
+
+    main(
+        audio_definition=args.audio_definition,
+        reference=args.reference,
+        log_file=args.log_file,
+        eval_config=args.eval_config,
+        output_folder=args.output_folder,
+        latency_unit=args.latency_unit,
+        bleu_tokenizer=args.bleu_tokenizer,
+        moses_tokenizer_lang=args.moses_tokenizer_lang,
+        comet=args.comet,
+        comet_model=args.comet_model,
+        source_sentences_file=args.source_sentences_file,
+    )
+
+
+if __name__ == "__main__":
+    cli_main()

From beead321d5e74e4ba6c26b31ffda5bffd743ff13 Mon Sep 17 00:00:00 2001
From: Peter Polak <polak@ufal.mff.cuni.cz>
Date: Mon, 2 Mar 2026 00:34:18 +0100
Subject: [PATCH 2/2] fix format

---
 simulstream/metrics/score_omnisteval.py | 43 ++++++++++++++-----------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/simulstream/metrics/score_omnisteval.py b/simulstream/metrics/score_omnisteval.py
index fa34810..f4e55bf 100644
--- a/simulstream/metrics/score_omnisteval.py
+++ b/simulstream/metrics/score_omnisteval.py
@@ -7,7 +7,12 @@
 import sacrebleu
 
 import simulstream
-from omnisteval.io import load_resegmentation_inputs as load_inputs, dump_instances_jsonl, dump_scores_tsv, format_report
+from omnisteval.io import (
+    load_resegmentation_inputs as load_inputs,
+    dump_instances_jsonl,
+    dump_scores_tsv,
+    format_report,
+)
 from omnisteval import resegment
 from omnisteval import evaluate_instances
 
@@ -30,7 +35,8 @@ def _build_settings(
     moses_tokenizer_lang: Optional[str],
     bleu_tokenizer: str,
     comet: bool,
-    comet_model: str,):
+    comet_model: str,
+):
     """
     Build a dictionary of settings to log alongside the scores.
     This is not used for the actual scoring, but can be helpful for record-keeping and debugging.
@@ -79,18 +85,16 @@ def main(
             source_sentences = [line.strip() for line in f]
 
     LOGGER.info("Loading resegmentation inputs for OmniSTEval...")
-    ref_words, hyp_words, segmentation, ref_sentences, all_have_emission_ca = (
-        load_inputs(
-            audio_definition,
-            None,
-            reference,
-            log_file,
-            hypothesis_format="simulstream",
-            char_level=(latency_unit == "char"),
-            offset_delays=False,
-            fix_emission_ca_flag=False,
-            simulstream_config_file=eval_config,
-        )
+    ref_words, hyp_words, segmentation, ref_sentences, all_have_emission_ca = load_inputs(
+        audio_definition,
+        None,
+        reference,
+        log_file,
+        hypothesis_format="simulstream",
+        char_level=(latency_unit == "char"),
+        offset_delays=False,
+        fix_emission_ca_flag=False,
+        simulstream_config_file=eval_config,
     )
 
     # suppress mosestokenizer INFO logs which are very verbose
@@ -139,14 +143,16 @@ def main(
     if output_folder is not None:
         dump_instances_jsonl(instances_dict, output_folder)
         dump_scores_tsv(scores, output_folder, is_longform=True)
-        with open(os.path.join(output_folder, "evaluation_report.txt"), "w", encoding="utf-8") as f:
+        with open(
+            os.path.join(output_folder, "evaluation_report.txt"), "w", encoding="utf-8"
+        ) as f:
             f.write(report)
 
 
 def cli_main():
     parser = argparse.ArgumentParser(
         "run_omnisteval",
-        description="Score streaming translation outputs using OmniSTEval."
+        description="Score streaming translation outputs using OmniSTEval.",
     )
     parser.add_argument("--eval-config", type=str, required=True)
     parser.add_argument("--log-file", type=str, required=True)
@@ -192,8 +198,9 @@ def cli_main():
         type=str,
         default="Unbabel/wmt22-comet-da",
         help=(
-            "Name or path of the COMET model to use for quality estimation when --comet is enabled. "
-            "Default: Unbabel/wmt22-comet-da."),
+            "Name or path of the COMET model to use for quality estimation when --comet is "
+            "enabled. Default: Unbabel/wmt22-comet-da."
+        ),
     )
     parser.add_argument("--comet", action="store_true", help="Enable COMET scoring.")
     parser.add_argument("--source-sentences-file", type=str, default=None)