# Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import unittest import numpy as np import pytest from datasets import load_dataset from huggingface_hub import hf_hub_download, snapshot_download from transformers import ( MODEL_FOR_CTC_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, AutoFeatureExtractor, AutoProcessor, AutoTokenizer, Speech2TextForConditionalGeneration, Wav2Vec2ForCTC, WhisperForConditionalGeneration, ) from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline from transformers.pipelines.audio_utils import chunk_bytes_iter from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter from transformers.testing_utils import ( is_pipeline_test, is_torch_available, nested_simplify, require_pyctcdecode, require_tf, require_torch, require_torch_gpu, require_torchaudio, slow, ) from .test_pipelines_common import ANY if is_torch_available(): import torch # We can't use this mixin because it assumes TF support. # from .test_pipelines_common import CustomInputPipelineCommonMixin @is_pipeline_test class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): model_mapping = dict( (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else []) + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) ) def get_test_pipeline(self, model, tokenizer, processor): if tokenizer is None: # Side effect of no Fast Tokenizer class for these model, so skipping # But the slow tokenizer test should still run as they're quite small self.skipTest("No tokenizer available") return # return None, None speech_recognizer = AutomaticSpeechRecognitionPipeline( model=model, tokenizer=tokenizer, feature_extractor=processor ) # test with a raw waveform audio = np.zeros((34000,)) audio2 = np.zeros((14000,)) return speech_recognizer, [audio, audio2] def run_pipeline_test(self, speech_recognizer, examples): audio = np.zeros((34000,)) outputs = speech_recognizer(audio) self.assertEqual(outputs, {"text": ANY(str)}) # Striding audio = {"raw": audio, "stride": (0, 4000), "sampling_rate": speech_recognizer.feature_extractor.sampling_rate} if speech_recognizer.type == "ctc": outputs = speech_recognizer(audio) self.assertEqual(outputs, {"text": ANY(str)}) elif "Whisper" in speech_recognizer.model.__class__.__name__: outputs = speech_recognizer(audio) self.assertEqual(outputs, {"text": ANY(str)}) else: # Non CTC models cannot use striding. with self.assertRaises(ValueError): outputs = speech_recognizer(audio) # Timestamps audio = np.zeros((34000,)) if speech_recognizer.type == "ctc": outputs = speech_recognizer(audio, return_timestamps="char") self.assertIsInstance(outputs["chunks"], list) n = len(outputs["chunks"]) self.assertEqual( outputs, { "text": ANY(str), "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(n)], }, ) outputs = speech_recognizer(audio, return_timestamps="word") self.assertIsInstance(outputs["chunks"], list) n = len(outputs["chunks"]) self.assertEqual( outputs, { "text": ANY(str), "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(n)], }, ) elif "Whisper" in speech_recognizer.model.__class__.__name__: outputs = speech_recognizer(audio, return_timestamps=True) self.assertIsInstance(outputs["chunks"], list) nb_chunks = len(outputs["chunks"]) self.assertGreater(nb_chunks, 0) self.assertEqual( outputs, { "text": ANY(str), "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(nb_chunks)], }, ) else: # Non CTC models cannot use return_timestamps with self.assertRaisesRegex( ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" ): outputs = speech_recognizer(audio, return_timestamps="char") @require_torch @slow def test_pt_defaults(self): pipeline("automatic-speech-recognition", framework="pt") @require_torch def test_small_model_pt(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/s2t-small-mustc-en-fr-st", tokenizer="facebook/s2t-small-mustc-en-fr-st", framework="pt", ) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) output = speech_recognizer(waveform) self.assertEqual(output, {"text": "(Applaudissements)"}) output = speech_recognizer(waveform, chunk_length_s=10) self.assertEqual(output, {"text": "(Applaudissements)"}) # Non CTC models cannot use return_timestamps with self.assertRaisesRegex( ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" ): _ = speech_recognizer(waveform, return_timestamps="char") @slow @require_torch def test_whisper_fp16(self): if not torch.cuda.is_available(): self.skipTest("Cuda is necessary for this test") speech_recognizer = pipeline( model="openai/whisper-base", device=0, torch_dtype=torch.float16, ) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) speech_recognizer(waveform) @require_torch def test_small_model_pt_seq2seq(self): speech_recognizer = pipeline( model="hf-internal-testing/tiny-random-speech-encoder-decoder", framework="pt", ) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) output = speech_recognizer(waveform) self.assertEqual(output, {"text": "あл ش 湯 清 ه ܬ া लᆨしث ल eか u w 全 u"}) @require_torch def test_small_model_pt_seq2seq_gen_kwargs(self): speech_recognizer = pipeline( model="hf-internal-testing/tiny-random-speech-encoder-decoder", framework="pt", ) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) output = speech_recognizer(waveform, max_new_tokens=10, generate_kwargs={"num_beams": 2}) self.assertEqual(output, {"text": "あл † γ ت ב オ 束 泣 足"}) @slow @require_torch @require_pyctcdecode def test_large_model_pt_with_lm(self): dataset = load_dataset("Narsil/asr_dummy", streaming=True) third_item = next(iter(dataset["test"].skip(3))) filename = third_item["file"] speech_recognizer = pipeline( task="automatic-speech-recognition", model="patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm", framework="pt", ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") output = speech_recognizer(filename) self.assertEqual( output, {"text": "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumaje"}, ) # Override back to pure CTC speech_recognizer.type = "ctc" output = speech_recognizer(filename) # plumajre != plumaje self.assertEqual( output, { "text": ( "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajre" ) }, ) speech_recognizer.type = "ctc_with_lm" # Simple test with CTC with LM, chunking + timestamps output = speech_recognizer(filename, chunk_length_s=2.0, return_timestamps="word") self.assertEqual( output, { "text": ( "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajcri" ), "chunks": [ {"text": "y", "timestamp": (0.52, 0.54)}, {"text": "en", "timestamp": (0.6, 0.68)}, {"text": "las", "timestamp": (0.74, 0.84)}, {"text": "ramas", "timestamp": (0.94, 1.24)}, {"text": "medio", "timestamp": (1.32, 1.52)}, {"text": "sumergidas", "timestamp": (1.56, 2.22)}, {"text": "revoloteaban", "timestamp": (2.36, 3.0)}, {"text": "algunos", "timestamp": (3.06, 3.38)}, {"text": "pájaros", "timestamp": (3.46, 3.86)}, {"text": "de", "timestamp": (3.92, 4.0)}, {"text": "quimérico", "timestamp": (4.08, 4.6)}, {"text": "y", "timestamp": (4.66, 4.68)}, {"text": "legendario", "timestamp": (4.74, 5.26)}, {"text": "plumajcri", "timestamp": (5.34, 5.74)}, ], }, ) # CTC + LM models cannot use return_timestamps="char" with self.assertRaisesRegex( ValueError, "^CTC with LM can only predict word level timestamps, set `return_timestamps='word'`$" ): _ = speech_recognizer(filename, return_timestamps="char") @require_tf def test_small_model_tf(self): self.skipTest("Tensorflow not supported yet.") @require_torch def test_torch_small_no_tokenizer_files(self): # test that model without tokenizer file cannot be loaded with pytest.raises(OSError): pipeline( task="automatic-speech-recognition", model="patrickvonplaten/tiny-wav2vec2-no-tokenizer", framework="pt", ) @require_torch @slow def test_torch_large(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h", tokenizer="facebook/wav2vec2-base-960h", framework="pt", ) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @require_torch @slow def test_return_timestamps_in_preprocess(self): pipe = pipeline( task="automatic-speech-recognition", model="openai/whisper-tiny", chunk_length_s=8, stride_length_s=1, ) data = load_dataset("librispeech_asr", "clean", split="test", streaming=True) sample = next(iter(data)) pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe") res = pipe(sample["audio"]["array"]) self.assertEqual(res, {"text": " Conquered returned to its place amidst the tents."}) res = pipe(sample["audio"]["array"], return_timestamps=True) self.assertEqual( res, { "text": " Conquered returned to its place amidst the tents.", "chunks": [{"text": " Conquered returned to its place amidst the tents.", "timestamp": (0.0, 3.36)}], }, ) pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] res = pipe(sample["audio"]["array"], return_timestamps="word") # fmt: off # Note that the word-level timestamps predicted here are pretty bad. self.assertEqual( res, { "text": " Conquered returned to its place amidst the tents.", "chunks": [ {'text': ' Conquered', 'timestamp': (29.78, 29.9)}, {'text': ' returned', 'timestamp': (29.9, 29.9)}, {'text': ' to', 'timestamp': (29.9, 29.9)}, {'text': ' its', 'timestamp': (29.9, 29.9)}, {'text': ' place', 'timestamp': (29.9, 29.9)}, {'text': ' amidst', 'timestamp': (29.9, 29.9)}, {'text': ' the', 'timestamp': (29.9, 29.9)}, {'text': ' tents.', 'timestamp': (29.9, 29.9)} ] } ) # fmt: on @require_torch def test_return_timestamps_in_init(self): # segment-level timestamps are accepted model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") dummy_speech = np.ones(100) pipe = pipeline( task="automatic-speech-recognition", model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, chunk_length_s=8, stride_length_s=1, return_timestamps=True, ) _ = pipe(dummy_speech) # word-level timestamps are accepted pipe = pipeline( task="automatic-speech-recognition", model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, chunk_length_s=8, stride_length_s=1, return_timestamps="word", ) _ = pipe(dummy_speech) # char-level timestamps are not accepted with self.assertRaisesRegex( ValueError, "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. " "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$", ): pipe = pipeline( task="automatic-speech-recognition", model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, chunk_length_s=8, stride_length_s=1, return_timestamps="char", ) _ = pipe(dummy_speech) @require_torch @slow def test_torch_whisper(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="openai/whisper-tiny", framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) output = speech_recognizer([filename], chunk_length_s=5, batch_size=4) self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}]) @slow def test_find_longest_common_subsequence(self): max_source_positions = 1500 processor = AutoProcessor.from_pretrained("openai/whisper-tiny") previous_sequence = [[51492, 406, 3163, 1953, 466, 13, 51612, 51612]] self.assertEqual( processor.decode(previous_sequence[0], output_offsets=True), { "text": " not worth thinking about.", "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}], }, ) # Merge when the previous sequence is a suffix of the next sequence # fmt: off next_sequences_1 = [ [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] ] # fmt: on self.assertEqual( processor.decode(next_sequences_1[0], output_offsets=True), { "text": ( " of spectators, retrievality is not worth thinking about. His instant panic was followed by a" " small, sharp blow high on his chest.<|endoftext|>" ), "offsets": [ {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)}, { "text": " His instant panic was followed by a small, sharp blow high on his chest.", "timestamp": (5.0, 9.4), }, ], }, ) merge = _find_timestamp_sequence( [[previous_sequence, (480_000, 0, 0)], [next_sequences_1, (480_000, 120_000, 0)]], processor.tokenizer, processor.feature_extractor, max_source_positions, ) # fmt: off self.assertEqual( merge, [51492, 406, 3163, 1953, 466, 13, 51739, 51739, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959], ) # fmt: on self.assertEqual( processor.decode(merge, output_offsets=True), { "text": ( " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" " chest." ), "offsets": [ {"text": " not worth thinking about.", "timestamp": (22.56, 27.5)}, { "text": " His instant panic was followed by a small, sharp blow high on his chest.", "timestamp": (27.5, 31.900000000000002), }, ], }, ) # Merge when the sequence is in the middle of the 1st next sequence # fmt: off next_sequences_2 = [ [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] ] # fmt: on # {'text': ' of spectators, retrievality is not worth thinking about. His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)} merge = _find_timestamp_sequence( [[previous_sequence, (480_000, 0, 0)], [next_sequences_2, (480_000, 120_000, 0)]], processor.tokenizer, processor.feature_extractor, max_source_positions, ) # fmt: off self.assertEqual( merge, [51492, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959], ) # fmt: on self.assertEqual( processor.decode(merge, output_offsets=True), { "text": ( " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" " chest." ), "offsets": [ { "text": ( " not worth thinking about. His instant panic was followed by a small, sharp blow high on" " his chest." ), "timestamp": (22.56, 31.900000000000002), }, ], }, ) # Merge when the previous sequence is not included in the current sequence # fmt: off next_sequences_3 = [[50364, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50584, 50257]] # fmt: on # {'text': ' His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)} merge = _find_timestamp_sequence( [[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 120_000, 0)]], processor.tokenizer, processor.feature_extractor, max_source_positions, ) # fmt: off self.assertEqual( merge, [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51832], ) # fmt: on self.assertEqual( processor.decode(merge, output_offsets=True), { "text": ( " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" " chest." ), "offsets": [ {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, { "text": " His instant panic was followed by a small, sharp blow high on his chest.", "timestamp": (24.96, 29.36), }, ], }, ) # last case is when the sequence is not in the first next predicted start and end of timestamp # fmt: off next_sequences_3 = [ [50364, 2812, 9836, 14783, 390, 406, 3163, 1953, 466, 13, 50634, 50634, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50934] ] # fmt: on merge = _find_timestamp_sequence( [[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 167_000, 0)]], processor.tokenizer, processor.feature_extractor, max_source_positions, ) # fmt: off self.assertEqual( merge, [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51912] ) # fmt: on self.assertEqual( processor.decode(merge, output_offsets=True), { "text": ( " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" " chest." ), "offsets": [ {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, { "text": " His instant panic was followed by a small, sharp blow high on his chest.", "timestamp": (24.96, 30.96), }, ], }, ) @slow @require_torch def test_whisper_timestamp_prediction(self): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) pipe = pipeline( model="openai/whisper-small", return_timestamps=True, ) output = pipe(ds[40]["audio"]) self.assertDictEqual( output, { "text": " A man said to the universe, Sir, I exist.", "chunks": [{"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 4.26)}], }, ) output = pipe(array, chunk_length_s=10) self.assertDictEqual( nested_simplify(output), { "chunks": [ {"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 5.5)}, { "text": ( " Sweat covered Brion's body, trickling into the " "tight-loan cloth that was the only garment he wore, the " "cut" ), "timestamp": (5.5, 11.95), }, { "text": ( " on his chest still dripping blood, the ache of his " "overstrained eyes, even the soaring arena around him " "with" ), "timestamp": (11.95, 19.61), }, { "text": " the thousands of spectators, retrievality is not worth thinking about.", "timestamp": (19.61, 25.0), }, { "text": " His instant panic was followed by a small, sharp blow high on his chest.", "timestamp": (25.0, 29.4), }, ], "text": ( " A man said to the universe, Sir, I exist. Sweat covered Brion's " "body, trickling into the tight-loan cloth that was the only garment " "he wore, the cut on his chest still dripping blood, the ache of his " "overstrained eyes, even the soaring arena around him with the " "thousands of spectators, retrievality is not worth thinking about. " "His instant panic was followed by a small, sharp blow high on his " "chest." ), }, ) output = pipe(array) self.assertDictEqual( output, { "chunks": [ {"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 5.5)}, { "text": ( " Sweat covered Brion's body, trickling into the " "tight-loan cloth that was the only garment" ), "timestamp": (5.5, 10.18), }, {"text": " he wore.", "timestamp": (10.18, 11.68)}, {"text": " The cut on his chest still dripping blood.", "timestamp": (11.68, 14.92)}, {"text": " The ache of his overstrained eyes.", "timestamp": (14.92, 17.6)}, { "text": ( " Even the soaring arena around him with the thousands of spectators were trivialities" ), "timestamp": (17.6, 22.56), }, {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, ], "text": ( " A man said to the universe, Sir, I exist. Sweat covered Brion's " "body, trickling into the tight-loan cloth that was the only garment " "he wore. The cut on his chest still dripping blood. The ache of his " "overstrained eyes. Even the soaring arena around him with the " "thousands of spectators were trivialities not worth thinking about." ), }, ) @require_torch @slow def test_torch_speech_encoder_decoder(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de", framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'}) @slow @require_torch def test_simple_wav2vec2(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h") feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) output = asr(waveform) self.assertEqual(output, {"text": ""}) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) filename = ds[40]["file"] with open(filename, "rb") as f: data = f.read() output = asr(data) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @slow @require_torch @require_torchaudio def test_simple_s2t(self): model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st") tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st") feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st") asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) waveform = np.tile(np.arange(1000, dtype=np.float32), 34) output = asr(waveform) self.assertEqual(output, {"text": "(Applausi)"}) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) filename = ds[40]["file"] with open(filename, "rb") as f: data = f.read() output = asr(data) self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) @slow @require_torch @require_torchaudio def test_simple_whisper_asr(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="openai/whisper-tiny.en", framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") filename = ds[0]["file"] output = speech_recognizer(filename) self.assertEqual( output, {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."}, ) output = speech_recognizer(filename, return_timestamps=True) self.assertEqual( output, { "text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.", "chunks": [ { "text": ( " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." ), "timestamp": (0.0, 5.44), } ], }, ) speech_recognizer.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] output = speech_recognizer(filename, return_timestamps="word") # fmt: off self.assertEqual( output, { "text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.", "chunks": [ {'text': ' Mr.', 'timestamp': (0.0, 1.02)}, {'text': ' Quilter', 'timestamp': (1.02, 1.18)}, {'text': ' is', 'timestamp': (1.18, 1.44)}, {'text': ' the', 'timestamp': (1.44, 1.58)}, {'text': ' apostle', 'timestamp': (1.58, 1.98)}, {'text': ' of', 'timestamp': (1.98, 2.3)}, {'text': ' the', 'timestamp': (2.3, 2.46)}, {'text': ' middle', 'timestamp': (2.46, 2.56)}, {'text': ' classes,', 'timestamp': (2.56, 3.38)}, {'text': ' and', 'timestamp': (3.38, 3.52)}, {'text': ' we', 'timestamp': (3.52, 3.6)}, {'text': ' are', 'timestamp': (3.6, 3.72)}, {'text': ' glad', 'timestamp': (3.72, 4.0)}, {'text': ' to', 'timestamp': (4.0, 4.26)}, {'text': ' welcome', 'timestamp': (4.26, 4.54)}, {'text': ' his', 'timestamp': (4.54, 4.92)}, {'text': ' gospel.', 'timestamp': (4.92, 6.66)}, ], }, ) # fmt: on # Whisper can only predict segment level timestamps or word level, not character level with self.assertRaisesRegex( ValueError, "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. " "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$", ): _ = speech_recognizer(filename, return_timestamps="char") @slow @require_torch @require_torchaudio def test_simple_whisper_translation(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="openai/whisper-large", framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large") feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large") speech_recognizer_2 = AutomaticSpeechRecognitionPipeline( model=model, tokenizer=tokenizer, feature_extractor=feature_extractor ) output_2 = speech_recognizer_2(filename) self.assertEqual(output, output_2) # either use generate_kwargs or set the model's generation_config # model.generation_config.task = "transcribe" # model.generation_config.lang = "<|it|>" speech_translator = AutomaticSpeechRecognitionPipeline( model=model, tokenizer=tokenizer, feature_extractor=feature_extractor, generate_kwargs={"task": "transcribe", "language": "<|it|>"}, ) output_3 = speech_translator(filename) self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."}) @slow @require_torch @require_torchaudio def test_xls_r_to_en(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/wav2vec2-xls-r-1b-21-to-en", feature_extractor="facebook/wav2vec2-xls-r-1b-21-to-en", framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."}) @slow @require_torch @require_torchaudio def test_xls_r_from_en(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/wav2vec2-xls-r-1b-en-to-15", feature_extractor="facebook/wav2vec2-xls-r-1b-en-to-15", framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."}) @slow @require_torch @require_torchaudio def test_speech_to_text_leveraged(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="patrickvonplaten/wav2vec2-2-bart-base", feature_extractor="patrickvonplaten/wav2vec2-2-bart-base", tokenizer=AutoTokenizer.from_pretrained("patrickvonplaten/wav2vec2-2-bart-base"), framework="pt", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) @slow @require_torch_gpu def test_wav2vec2_conformer_float16(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/wav2vec2-conformer-rope-large-960h-ft", device="cuda:0", torch_dtype=torch.float16, framework="pt", ) dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] output = speech_recognizer(sample) self.assertEqual( output, {"text": "MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL"}, ) @require_torch def test_chunking_fast(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="hf-internal-testing/tiny-random-wav2vec2", chunk_length_s=10.0, ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 audio_tiled = np.tile(audio, n_repeats) output = speech_recognizer([audio_tiled], batch_size=2) self.assertEqual(output, [{"text": ANY(str)}]) self.assertEqual(output[0]["text"][:6], "ZBT ZC") @require_torch def test_return_timestamps_ctc_fast(self): speech_recognizer = pipeline( task="automatic-speech-recognition", model="hf-internal-testing/tiny-random-wav2vec2", ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") # Take short audio to keep the test readable audio = ds[40]["audio"]["array"][:800] output = speech_recognizer(audio, return_timestamps="char") self.assertEqual( output, { "text": "ZBT ZX G", "chunks": [ {"text": " ", "timestamp": (0.0, 0.012)}, {"text": "Z", "timestamp": (0.012, 0.016)}, {"text": "B", "timestamp": (0.016, 0.02)}, {"text": "T", "timestamp": (0.02, 0.024)}, {"text": " ", "timestamp": (0.024, 0.028)}, {"text": "Z", "timestamp": (0.028, 0.032)}, {"text": "X", "timestamp": (0.032, 0.036)}, {"text": " ", "timestamp": (0.036, 0.04)}, {"text": "G", "timestamp": (0.04, 0.044)}, ], }, ) output = speech_recognizer(audio, return_timestamps="word") self.assertEqual( output, { "text": "ZBT ZX G", "chunks": [ {"text": "ZBT", "timestamp": (0.012, 0.024)}, {"text": "ZX", "timestamp": (0.028, 0.036)}, {"text": "G", "timestamp": (0.04, 0.044)}, ], }, ) @require_torch @require_pyctcdecode def test_chunking_fast_with_lm(self): speech_recognizer = pipeline( model="hf-internal-testing/processor_with_lm", chunk_length_s=10.0, ) ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 audio_tiled = np.tile(audio, n_repeats) # Batch_size = 1 output1 = speech_recognizer([audio_tiled], batch_size=1) self.assertEqual(output1, [{"text": ANY(str)}]) self.assertEqual(output1[0]["text"][:6], "