list index out of range with word level timestamps

#131
by mkvbn - opened

Hey,

I'm trying to transcribe an audio with word level timestamps, but I keep getting an error.
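
For context, the pipeline is created roughly like this (a sketch only; the checkpoint name is a placeholder, not necessarily the one I'm using, and the failing call itself is shown in the traceback below):

import torch
from transformers import pipeline

# Sketch of the setup -- the checkpoint below is a placeholder for whichever Whisper model I load
transcribing_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",  # placeholder checkpoint
    chunk_length_s=30,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)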

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[8], line 1
----> 1 asr_out = transcribing_pipe(
      2     '../SampleData/Saba_interview_short.wav',
      3     return_timestamps="word",
      4     generate_kwargs={"language": "danish"}
      5     )
      7 asr_out

File c:\Users\User\miniconda3\envs\vva\lib\site-packages\transformers\pipelines\automatic_speech_recognition.py:285, in AutomaticSpeechRecognitionPipeline.__call__(self, inputs, **kwargs)
    222 def __call__(
    223     self,
    224     inputs: Union[np.ndarray, bytes, str],
    225     **kwargs,
    226 ):
    227     """
    228     Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
    229     documentation for more information.
   (...)
    283                 `"".join(chunk["text"] for chunk in output["chunks"])`.
    284     """
--> 285     return super().__call__(inputs, **kwargs)

File c:\Users\User\miniconda3\envs\vva\lib\site-packages\transformers\pipelines\base.py:1235, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
   1233     return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
   1234 elif self.framework == "pt" and isinstance(self, ChunkPipeline):
-> 1235     return next(
   1236         iter(
   1237             self.get_iterator(
   1238                 [inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params
   1239             )
   1240         )
   1241     )
   1242 else:
   1243     return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

File c:\Users\User\miniconda3\envs\vva\lib\site-packages\transformers\pipelines\pt_utils.py:125, in PipelineIterator.__next__(self)
    123 # We're out of items within a batch
    124 item = next(self.iterator)
--> 125 processed = self.infer(item, **self.params)
    126 # We now have a batch of "inferred things".
    127 if self.loader_batch_size is not None:
    128     # Try to infer the size of the batch

File c:\Users\User\miniconda3\envs\vva\lib\site-packages\transformers\pipelines\automatic_speech_recognition.py:590, in AutomaticSpeechRecognitionPipeline.postprocess(self, model_outputs, decoder_kwargs, return_timestamps, return_language)
    587             stride_right /= sampling_rate
    588             output["stride"] = chunk_len, stride_left, stride_right
--> 590     text, optional = self.tokenizer._decode_asr(
    591         model_outputs,
    592         return_timestamps=return_timestamps,
    593         return_language=return_language,
    594         time_precision=time_precision,
    595     )
    596 else:
    597     items = np.concatenate(final_items, axis=1)

File c:\Users\User\miniconda3\envs\vva\lib\site-packages\transformers\models\whisper\tokenization_whisper.py:831, in WhisperTokenizer._decode_asr(self, model_outputs, return_timestamps, return_language, time_precision)
    830 def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
--> 831     return _decode_asr(
    832         self,
    833         model_outputs,
    834         return_timestamps=return_timestamps,
    835         return_language=return_language,
    836         time_precision=time_precision,
    837     )

File c:\Users\User\miniconda3\envs\vva\lib\site-packages\transformers\models\whisper\tokenization_whisper.py:1027, in _decode_asr(tokenizer, model_outputs, return_timestamps, return_language, time_precision)
   1025 current_tokens.append(token)
   1026 if return_timestamps == "word":
-> 1027     start_time = round(token_timestamps[i] + time_offset, 2)
   1028     print(start_time)
   1030     if i + 1 < len(token_timestamps):

IndexError: list index out of range

I've tried to debug it a bit with logging:

Tokens: 111
Current i: 4
Current token: 0.0
0.0
Tokens: 111
Current i: 5
Current token: 0.1599999964237213
0.16
Tokens: 111
Current i: 6
Current token: 0.2800000011920929
0.28
Tokens: 111
Current i: 7
Current token: 0.36000001430511475
0.36
Tokens: 111
Current i: 8
Current token: 0.6600000262260437
0.66
Tokens: 111
Current i: 9
Current token: 0.8399999737739563
0.84
Tokens: 111
Current i: 10
Current token: 1.0199999809265137
1.02
Tokens: 111
Current i: 11
Current token: 1.159999966621399
1.16
Tokens: 111
Current i: 12
Current token: 1.2599999904632568
1.26
Tokens: 111
Current i: 13
Current token: 1.2999999523162842
1.3
Tokens: 111
Current i: 14
Current token: 1.340000033378601
1.34
Tokens: 111
Current i: 15
Current token: 1.559999942779541
1.56
Tokens: 111
Current i: 16
Current token: 1.7999999523162842
1.8
Tokens: 111
Current i: 17
Current token: 1.9600000381469727
1.96
Tokens: 111
Current i: 18
Current token: 2.0999999046325684
2.1
Tokens: 111
Current i: 19
Current token: 2.2200000286102295
2.22
Tokens: 111
Current i: 20
Current token: 2.2799999713897705
2.28
Tokens: 111
Current i: 21
Current token: 2.5199999809265137
2.52
Tokens: 111
Current i: 22
Current token: 3.200000047683716
3.2
Tokens: 111
Current i: 23
Current token: 3.2799999713897705
3.28
Tokens: 111
Current i: 24
Current token: 3.3399999141693115
3.34
Tokens: 111
Current i: 25
Current token: 3.5199999809265137
3.52
Tokens: 111
Current i: 26
Current token: 3.619999885559082
3.62
Tokens: 111
Current i: 27
Current token: 3.6600000858306885
3.66
Tokens: 111
Current i: 28
Current token: 3.7200000286102295
3.72
Tokens: 111
Current i: 29
Current token: 3.799999952316284
3.8
Tokens: 111
Current i: 30
Current token: 3.880000114440918
3.88
Tokens: 111
Current i: 31
Current token: 3.9800000190734863
3.98
Tokens: 111
Current i: 32
Current token: 4.019999980926514
4.02
Tokens: 111
Current i: 33
Current token: 4.480000019073486
4.48
Tokens: 111
Current i: 34
Current token: 4.659999847412109
4.66
Tokens: 111
Current i: 35
Current token: 4.78000020980835
4.78
Tokens: 111
Current i: 36
Current token: 4.860000133514404
4.86
Tokens: 111
Current i: 37
Current token: 4.860000133514404
4.86
Tokens: 111
Current i: 38
Current token: 4.860000133514404
4.86
Tokens: 111
Current i: 39
Current token: 5.099999904632568
5.1
Tokens: 111
Current i: 40
Current token: 5.539999961853027
5.54
Tokens: 111
Current i: 41
Current token: 5.71999979019165
5.72
Tokens: 111
Current i: 42
Current token: 5.71999979019165
5.72
Tokens: 111
Current i: 43
Current token: 5.71999979019165
5.72
Tokens: 111
Current i: 44
Current token: 6.039999961853027
6.04
Tokens: 111
Current i: 45
Current token: 6.320000171661377
6.32
Tokens: 111
Current i: 46
Current token: 6.360000133514404
6.36
Tokens: 111
Current i: 47
Current token: 6.440000057220459
6.44
Tokens: 111
Current i: 48
Current token: 6.460000038146973
6.46
Tokens: 111
Current i: 49
Current token: 6.460000038146973
6.46
Tokens: 111
Current i: 50
Current token: 6.519999980926514
6.52
Tokens: 111
Current i: 51
Current token: 6.659999847412109
6.66
Tokens: 111
Current i: 52
Current token: 7.099999904632568
7.1
Tokens: 111
Current i: 53
Current token: 7.260000228881836
7.26
Tokens: 111
Current i: 54
Current token: 7.260000228881836
7.26
Tokens: 111
Current i: 55
Current token: 7.28000020980835
7.28
Tokens: 111
Current i: 56
Current token: 7.400000095367432
7.4
Tokens: 111
Current i: 57
Current token: 7.480000019073486
7.48
Tokens: 111
Current i: 58
Current token: 7.639999866485596
7.64
Tokens: 111
Current i: 59
Current token: 7.78000020980835
7.78
Tokens: 111
Current i: 60
Current token: 7.920000076293945
7.92
Tokens: 111
Current i: 61
Current token: 8.220000267028809
8.22
Tokens: 111
Current i: 62
Current token: 8.899999618530273
8.9
Tokens: 111
Current i: 63
Current token: 9.15999984741211
9.16
Tokens: 111
Current i: 64
Current token: 9.199999809265137
9.2
Tokens: 111
Current i: 65
Current token: 9.479999542236328
9.48
Tokens: 111
Current i: 66
Current token: 9.479999542236328
9.48
Tokens: 111
Current i: 67
Current token: 9.680000305175781
9.68
Tokens: 111
Current i: 68
Current token: 9.779999732971191
9.78
Tokens: 111
Current i: 69
Current token: 9.779999732971191
9.78
Tokens: 111
Current i: 70
Current token: 9.84000015258789
9.84
Tokens: 111
Current i: 71
Current token: 9.920000076293945
9.92
Tokens: 111
Current i: 72
Current token: 9.920000076293945
9.92
Tokens: 111
Current i: 73
Current token: 9.920000076293945
9.92
Tokens: 111
Current i: 74
Current token: 10.0600004196167
10.06
Tokens: 111
Current i: 75
Current token: 10.0600004196167
10.06
Tokens: 111
Current i: 76
Current token: 10.079999923706055
10.08
Tokens: 111
Current i: 77
Current token: 10.079999923706055
10.08
Tokens: 111
Current i: 78
Current token: 10.260000228881836
10.26
Tokens: 111
Current i: 79
Current token: 10.819999694824219
10.82
Tokens: 111
Current i: 80
Current token: 11.220000267028809
11.22
Tokens: 111
Current i: 81
Current token: 11.380000114440918
11.38
Tokens: 111
Current i: 82
Current token: 11.399999618530273
11.4
Tokens: 111
Current i: 83
Current token: 11.4399995803833
11.44
Tokens: 111
Current i: 84
Current token: 11.539999961853027
11.54
Tokens: 111
Current i: 85
Current token: 11.619999885559082
11.62
Tokens: 111
Current i: 86
Current token: 11.619999885559082
11.62
Tokens: 111
Current i: 87
Current token: 11.779999732971191
11.78
Tokens: 111
Current i: 88
Current token: 11.779999732971191
11.78
Tokens: 111
Current i: 89
Current token: 11.979999542236328
11.98
Tokens: 111
Current i: 90
Current token: 12.220000267028809
12.22
Tokens: 111
Current i: 91
Current token: 12.300000190734863
12.3
Tokens: 111
Current i: 92
Current token: 12.4399995803833
12.44
Tokens: 111
Current i: 93
Current token: 12.720000267028809
12.72
Tokens: 111
Current i: 94
Current token: 13.859999656677246
13.86
Tokens: 111
Current i: 95
Current token: 14.239999771118164
14.24
Tokens: 111
Current i: 96
Current token: 14.380000114440918
14.38
Tokens: 111
Current i: 97
Current token: 14.5
14.5
Tokens: 111
Current i: 98
Current token: 14.619999885559082
14.62
Tokens: 111
Current i: 99
Current token: 15.239999771118164
15.24
Tokens: 111
Current i: 100
Current token: 15.640000343322754
15.64
Tokens: 111
Current i: 101
Current token: 15.720000267028809
15.72
Tokens: 111
Current i: 102
Current token: 15.720000267028809
15.72
Tokens: 111
Current i: 103
Current token: 15.84000015258789
15.84
Tokens: 111
Current i: 104
Current token: 16.139999389648438
16.14
Tokens: 111
Current i: 105
Current token: 16.68000030517578
16.68
Tokens: 111
Current i: 106
Current token: 16.940000534057617
16.94
Tokens: 111
Current i: 107
Current token: 17.100000381469727
17.1
Tokens: 111
Current i: 108
Current token: 17.239999771118164
17.24
Tokens: 111
Current i: 109
Current token: 17.31999969482422
17.32
Tokens: 111
Current i: 110
Current token: 17.31999969482422
17.32
Tokens: 111
Current i: 111

The first thing I noticed was that 'i' starts at 4 instead of 0.
The second thing I noticed is that the last two tokens start at the same time.

I've tried a couple of fixes found around the community, like setting batch_size to 1, but to no avail.

Does anybody have a similar experience, or know how to fix this issue?

I've made a temporary fix that seems to work.

In tokenization_whisper.py, I've changed the following:
The _collate_word_timestamps function has been changed to:

def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language):
    words, _, token_indices = _combine_tokens_into_words(tokenizer, tokens, language)
    print(f'Indices: {token_indices}')
    timings = []

    for word, indices in zip(words, token_indices):
        # Check if indices are within the valid range of token_timestamps
        if indices[0] < len(token_timestamps) and indices[-1] < len(token_timestamps):
            # Print the current indices
            print(f"Current indices: {indices}")
            
            # Append the dictionary to the timings list
            timings.append({
                "text": word,
                "timestamp": (token_timestamps[indices[0]][0], token_timestamps[indices[-1]][1]),
            })
        else:
            print(f"Skipping indices: {indices} (out of range)")
            print(f'Tokens: {len(token_timestamps)}')
            for ind in indices:
                print(f'{ind}: {token_timestamps[ind]}')
                ind_start_time = token_timestamps[ind][0]
                ind_end_time = token_timestamps[ind][1]

                if ind_end_time is None:
                    # We've reached the final timestamp. Let's return the data before we hit an index out of range or duplicate ending words.
                    return timings

                timings.append({
                    "text": word,
                    "timestamp": (ind_start_time, ind_end_time)
                })
                
    return timings

In the _decode_asr function, on line 1021, I've changed the else branch to the following:

else:
    # 4/ Regular token
    # We just append to the list of all tokens so we can handle
    # merges later and decode into text.
    current_tokens.append(token)
    if return_timestamps == "word":
        print(f'Tokens: {len(token_timestamps)}')
        print(f'Current i: {i}')

        if i != len(token_timestamps):
            print(f'Current token: {token_timestamps[i]}')
            start_time = round(token_timestamps[i] + time_offset, 2)
            print(start_time)

            if i + 1 < len(token_timestamps):
                end_time = round(token_timestamps[i + 1] + time_offset, 2)
            else:
                end_time = None  # should never happen
            current_token_timestamps.append((start_time, end_time))
        else:
            print('Token index outside of range. Skipping')
Now I just need to find a way to "inject" these changes without editing the site-packages files.
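
One option might be to monkey-patch the module at runtime instead of editing the installed files. A rough sketch of the idea (my_whisper_patches is a hypothetical module of mine holding the two modified functions shown above):

import transformers.models.whisper.tokenization_whisper as whisper_tok

# Hypothetical module containing my modified copies of the two functions above
from my_whisper_patches import _collate_word_timestamps, _decode_asr

# The traceback goes through this module's module-level functions, so swapping
# them out before the pipeline is called should make the tokenizer pick up the
# patched versions at call time.
whisper_tok._collate_word_timestamps = _collate_word_timestamps
whisper_tok._decode_asr = _decode_asr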

Hey @mkvbn - thanks for reporting! Could you confirm that you're using the latest version of Transformers? If the issue persists, could you please open an issue in Transformers with a reproducible code snippet? E.g. one we can run end-to-end to mimic this issue. Thanks!

I am using transformers 4.42.2, which seems to be the latest version.

I will open an issue on GitHub.
