diff --git a/README.md b/README.md index 27aaf41..d1604e7 100644 --- a/README.md +++ b/README.md @@ -177,14 +177,15 @@ can score your speech processor by running: simulstream_score_latency --scorer stream_laal \ --eval-config config/speech_processor.yaml \ --log-file metrics.jsonl \ - --reference REFERENCE_FILE.txt \ + --reference REFERENCES_FILE.tgt \ --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml simulstream_score_quality --scorer comet \ --eval-config config/speech_processor.yaml \ --log-file metrics.jsonl \ - --references REFERENCES_FILE.txt \ - --transcripts TRANSCRIPTS_FILE.txt + --references REFERENCES_FILE.tgt \ + --transcripts TRANSCRIPTS_FILE.src \ + --audio-definition YAML_AUDIO_REFERENCES_DEFINITION.yaml simulstream_stats --eval-config config/speech_processor.yaml \ --log-file metrics.jsonl @@ -198,7 +199,20 @@ the selected metric (``--scorer``). Similarly, ``simulstream_score_quality`` evaluated the quality of the generated outputs against one (or more) reference (and transcript, only for metrics -requiring them) file(s). +requiring them) file(s). Here, the `YAML_AUDIO_REFERENCES_DEFINITION.yaml` has the same number of entries (sentence definitions +in terms of wav file origin, offset and duration) as `REFERENCES_FILE.tgt` and `TRANSCRIPTS_FILE.src`. + +As an alternative, `simulstream_score_quality` can be run without the `--audio-definition` specification, by using a list of +files as arguments of `--references` and `--transcripts`. In this case, the name of the files (trimmed of the extension) +**must be the same** as that of the audio files used (i.e. the names present in `metrics.jsonl`). For instance: + +``` +simulstream_score_quality --scorer comet \ + --eval-config config/speech_processor.yaml \ + --log-file metrics.jsonl \ + --references AUDIO1.tgt AUDIO2.tgt AUDIO3.tgt \ + --transcripts AUDIO1.src AUDIO2.src AUDIO3.src +``` Lastly, ``simulstream_stats`` computes statistics like the computational cost and flickering ratio. 
diff --git a/simulstream/metrics/score_quality.py b/simulstream/metrics/score_quality.py index 0a621d3..6479f98 100644 --- a/simulstream/metrics/score_quality.py +++ b/simulstream/metrics/score_quality.py @@ -124,6 +124,19 @@ def cli_main(): --log-file metrics.jsonl \\ --references ref.en \\ --transcripts src.it \\ + --audio-definition audio_def.yaml \\ + --scorer sacrebleu + + Otherwise, the script can be invoked without specifying the `--audio-definition`, + but in this case the name of the reference and transcript files (trimmed of + the extension) must be the same as that of the audio files used (i.e. the names present + in `metrics.jsonl`), e.g.: + + $ python -m simulstream.metrics.score_quality \\ + --eval-config config/speech-processor.yaml \\ + --log-file metrics.jsonl \\ + --references 1.en 2.en \\ + --transcripts 1.it 2.it \\ --scorer sacrebleu """ LOGGER.info(f"Simulstream version: {simulstream.__version__}") @@ -140,14 +153,17 @@ def cli_main(): "specified, this should be a single file containing all the lines of the audios in " "the reference, which should be of the same length of the audio definition. " "Otherwise, this should be a list of files, where each contains the lines " - "corresponding to an audio file.") + "corresponding to an audio file. In the case of being a list of files, the file " + "stem must match a corresponding transcript for an audio file (if applicable " + "to the quality metric).") parser.add_argument( "--transcripts", nargs="+", type=str, help="Path to the textual files containing reference transcripts. If `--audio-definition` " "is specified, this should be a single file containing all the lines of the audios " "in the reference, which should be of the same length of the audio definition. " "Otherwise, this should be a list of files, where each contains the lines " - "corresponding to an audio file.") + "corresponding to an audio file. 
In the case of being a list of files, the file " + "stem must match a corresponding reference for an audio file.") parser.add_argument( "--audio-definition", "-a", type=str, default=None, help="Path to the yaml file containing the segment-level audio information.")