crystal-technologies
/

CRYSTAL-R1

Model card Files Files and versions Community

CRYSTAL-R1 / SoundScribe /SpeakerID /scripts /confidence_ensembles /test_confidence_ensembles.py

crystal-technologies

Upload 1287 files

2d8da09 over 1 year ago

raw

history blame

4.75 kB

	# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# These tests are not included in CI because they take a moderate amount of time;
	# they are supposed to be run in the nightly pipeline instead.

	import os
	import shlex
	import subprocess
	import sys
	from pathlib import Path

	import pytest

	from nemo.collections.asr.parts.utils.transcribe_utils import TextProcessingConfig

	# Put `<Path(__file__).parents[2]>/examples/asr` (two levels above this file's
	# directory) on sys.path; the import below presumably resolves
	# `speech_to_text_eval` from that directory, so it must come after this line.
	sys.path.append(str(Path(__file__).parents[2] / 'examples' / 'asr'))
	import speech_to_text_eval


	@pytest.mark.parametrize(
	    'build_args',
	    [
	        "ensemble.0.model=stt_es_conformer_ctc_large ensemble.1.model=stt_it_conformer_ctc_large",
	        "ensemble.0.model=stt_es_conformer_transducer_large ensemble.1.model=stt_it_conformer_transducer_large",
	        (
	            "ensemble.0.model=stt_es_fastconformer_hybrid_large_pc ensemble.1.model=stt_it_fastconformer_hybrid_large_pc "
	            "confidence.method_cfg.alpha=0.33 confidence.method_cfg.entropy_norm=exp "
	        ),
	        (
	            "ensemble.0.model=stt_es_fastconformer_hybrid_large_pc "
	            "ensemble.1.model=stt_it_fastconformer_hybrid_large_pc "
	            "transcription.decoder_type=ctc "
	        ),
	        "ensemble.0.model=stt_es_conformer_ctc_large ensemble.1.model=stt_it_conformer_transducer_large",
	        (
	            "ensemble.0.model=stt_es_conformer_ctc_large "
	            "ensemble.1.model=stt_it_conformer_ctc_large "
	            f"ensemble.0.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'es' / 'dev_manifest.json'} "
	            f"ensemble.1.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'it' / 'dev_manifest.json'} "
	            "tune_confidence=True "
	        ),
	        (
	            "ensemble.0.model=stt_es_conformer_transducer_large "
	            "ensemble.1.model=stt_it_conformer_transducer_large "
	            f"ensemble.0.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'es' / 'dev_manifest.json'} "
	            f"ensemble.1.dev_manifest={Path(os.getenv('TEST_DATA_PATH', '')) / 'it' / 'dev_manifest.json'} "
	            "tune_confidence=True "
	        ),
	    ],
	    ids=(
	        [
	            "CTC models",
	            "Transducer models",
	            "Hybrid models (Transducer mode)",
	            "Hybrid models (CTC mode)",
	            "CTC + Transducer",
	            "CTC models + confidence tuning",
	            "Transducer models + confidence tuning",
	        ]
	    ),
	)
	def test_confidence_ensemble(tmp_path, build_args):
	    """Integration tests for confidence-ensembles.

	    Builds an ensemble with ``build_ensemble.py`` (located next to this file)
	    into ``tmp_path / 'ensemble.nemo'``, then evaluates it on
	    ``$TEST_DATA_PATH/test_manifest.json`` and checks that the reported
	    metric is below 0.20.

	    To use, make sure to define TEST_DATA_PATH env variable with path to
	    the test data. The following structure is assumed:

	        $TEST_DATA_PATH
	        ├── es
	        │   ├── dev
	        │   ├── dev_manifest.json
	        │   ├── test
	        │   ├── train
	        │   └── train_manifest.json
	        ├── it
	        │   ├── dev
	        │   ├── dev_manifest.json
	        │   ├── test
	        │   ├── test_manifest.json
	        │   ├── train
	        │   └── train_manifest.json
	        └── test_manifest.json

	    Args:
	        tmp_path: pytest-provided temporary directory; receives the built
	            ensemble checkpoint and the evaluation output file.
	        build_args: whitespace-separated command-line overrides appended to
	            the ``build_ensemble.py`` invocation.

	    Raises:
	        ValueError: if TEST_DATA_PATH is unset or empty.
	        subprocess.CalledProcessError: if the ensemble build exits non-zero.
	    """
	    # checking for test data and failing right away if not defined
	    test_data_root = os.getenv("TEST_DATA_PATH")
	    if not test_data_root:
	        raise ValueError("TEST_DATA_PATH env variable has to be defined!")
	    test_data_path = Path(test_data_root)

	    ensemble_path = tmp_path / 'ensemble.nemo'

	    # Pass argv as a list (no shell) so the script and output paths survive
	    # spaces or shell metacharacters, and use sys.executable so the build runs
	    # under the same interpreter/environment as this test session.
	    build_ensemble_cmd = [
	        sys.executable,
	        str(Path(__file__).parent / 'build_ensemble.py'),
	        "--config-name=ensemble_config.yaml",
	        f"output_path={ensemble_path}",
	        # build_args is a single whitespace-separated string of overrides
	        *shlex.split(build_args),
	    ]
	    subprocess.run(build_ensemble_cmd, check=True)

	    eval_cfg = speech_to_text_eval.EvaluationConfig(
	        dataset_manifest=str(test_data_path / 'test_manifest.json'),
	        output_filename=str(tmp_path / 'output.json'),
	        model_path=str(ensemble_path),
	        text_processing=TextProcessingConfig(punctuation_marks=".,?", do_lowercase=True, rm_punctuation=True),
	    )

	    results = speech_to_text_eval.main(eval_cfg)
	    assert results.metric_value < 0.20  # relaxed check for better than 20% WER