hlt-mt · mgaido91 · Mar 4, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 3, 2026
diff --git a/README.md b/README.md
@@ -177,14 +177,15 @@ can score your speech processor by running:
 simulstream_score_latency --scorer stream_laal \
     --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl \
-    --reference REFERENCE_FILE.txt \
+    --reference REFERENCES_FILE.tgt \
     --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 
 simulstream_score_quality --scorer comet \
     --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl \
-    --references REFERENCES_FILE.txt \
-    --transcripts TRANSCRIPTS_FILE.txt
+    --references REFERENCES_FILE.tgt \
+    --transcripts TRANSCRIPTS_FILE.src \
+    --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml
 
 simulstream_stats --eval-config config/speech_processor.yaml \
     --log-file metrics.jsonl
@@ -198,7 +199,20 @@ the selected metric (``--scorer``).
 
 Similarly, ``simulstream_score_quality`` evaluated the quality
 of the generated outputs against one (or more) reference (and transcript, only for metrics
-requiring them) file(s).
+requiring them) file(s). Here, `YAML_AUDIO_REFERENCES_DEFINITION.yaml` must contain the same number of entries (sentence definitions
+in terms of wav file origin, offset and duration) as there are lines in `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`.
+
+As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` option, by passing a list of
+files to `--references` and `--transcripts`. In this case, the names of the files (with the extension removed)
+**must be the same as** the names of the audio files used (i.e. the names present in `metrics.jsonl`). For instance:
+
+```
+simulstream_score_quality --scorer comet \
+    --eval-config config/speech_processor.yaml \
+    --log-file metrics.jsonl \
+    --references AUDIO1.tgt AUDIO2.tgt AUDIO3.tgt \
+    --transcripts AUDIO1.src AUDIO2.src AUDIO3.src
+```
 
 Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio.
 

diff --git a/simulstream/metrics/score_quality.py b/simulstream/metrics/score_quality.py
@@ -124,6 +124,19 @@ def cli_main():
             --log-file metrics.jsonl \\
             --references ref.en \\
             --transcripts src.it \\
+            --audio-definition audio_def.yaml \\
+            --scorer sacrebleu
+
+    Otherwise, the script can be invoked without specifying `--audio-definition`,
+    but in this case the names of the reference and transcript files (with the
+    extension removed) must be the same as the names of the audio files used (i.e.
+    the names present in `metrics.jsonl`), e.g.:
+
+        $ python -m simulstream.metrics.score_quality \\
+            --eval-config config/speech-processor.yaml \\
+            --log-file metrics.jsonl \\
+            --references 1.en 2.en \\
+            --transcripts 1.it 2.it \\
             --scorer sacrebleu
     """
     LOGGER.info(f"Simulstream version: {simulstream.__version__}")
@@ -140,14 +153,17 @@ def cli_main():
              "specified, this should be a single file containing all the lines of the audios in "
              "the reference, which should be of the same length of the audio definition. "
              "Otherwise, this should be a list of files, where each contains the lines "
-             "corresponding to an audio file.")
+             "corresponding to an audio file. In the case of being a list of files, the file "
+             "stem must match a corresponding transcript for an audio file (if applicable "
+             "to the quality metric).")
     parser.add_argument(
         "--transcripts", nargs="+", type=str,
         help="Path to the textual files containing reference transcripts. If `--audio-definition` "
              "is specified, this should be a single file containing all the lines of the audios "
              "in the reference, which should be of the same length of the audio definition. "
              "Otherwise, this should be a list of files, where each contains the lines "
-             "corresponding to an audio file.")
+             "corresponding to an audio file. In the case of being a list of files, the file "
+             "stem must match a corresponding reference for an audio file.")
     parser.add_argument(
         "--audio-definition", "-a", type=str, default=None,
         help="Path to the yaml file containing the segment-level audio information.")