AlexHung29629 committed on
Commit
8cf2fc3
1 Parent(s): 16beaf9

Create audio_processing_mllama.py

Browse files
Files changed (1) hide show
  1. audio_processing_mllama.py +85 -0
audio_processing_mllama.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Dict, List, Optional, Union
3
+ import numpy as np
4
+ import torch
5
+ from transformers.tokenization_utils_base import AudioInput
6
+ from transformers.models.seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor
7
+ from transformers.utils import TensorType
8
+ from transformers.feature_extraction_utils import BatchFeature
9
+
10
+
11
+
12
def make_list_of_audio_clips(audio: AudioInput) -> List[List[Optional[np.ndarray]]]:
    """
    Normalize the accepted audio input layouts to a list of per-sample clip lists.

    Args:
        audio (`AudioInput`):
            One of: a single audio clip; a flat list/tuple of clips, treated as the
            clips of one sample; or a list/tuple whose elements are all lists/tuples,
            treated as one clip sequence per sample.

    Returns:
        A nested sequence indexed as `[sample][clip]`. Input that is already nested
        (including an empty list/tuple) is returned as the same object, not a copy.
    """
    # A single audio clip becomes a batch of one sample holding one clip.
    if not isinstance(audio, (list, tuple)):
        return [[audio]]

    # Every element is itself a sequence: already in `[sample][clip]` form. An empty
    # sequence also lands here, because `all` over no elements is true.
    if all(isinstance(clips, (list, tuple)) for clips in audio):
        return audio

    # A flat sequence of clips is the clip list of a single sample.
    return [audio]
36
+
37
def build_audio_tokens(encoding: Dict, audio_features: List[List[np.ndarray]], audio_token_id: int) -> Dict:
    """
    Expand each audio placeholder token in `encoding` to one slot per audio embedding.

    For sample `i`, the `j`-th clip replaces the first remaining occurrence of
    `audio_token_id` in `encoding['input_ids'][i]` with
    `get_num_embeddings(num_frames)` copies of the negative id `-1 - j`, where
    `num_frames` is the size of the clip's first dimension. The attention mask of
    every expanded sample is then rebuilt as all ones with the new length.

    Args:
        encoding (`Dict`):
            Holds `'input_ids'` and `'attention_mask'`, both indexable by sample.
            Each `input_ids[i]` is expected to be a Python list. Updated in place.
        audio_features (`List[List[np.ndarray]]`):
            Per-sample lists of clip features; only `shape[0]` of each clip is read.
        audio_token_id (`int`):
            Id of the placeholder token that marks where a clip is inserted.

    Returns:
        The same `encoding` object, after the in-place update.

    Raises:
        ValueError: If a sample contains fewer placeholders than it has clips.
    """
    for i, clips in enumerate(audio_features):
        if not clips:
            # Nothing to insert: leave this sample's ids and mask untouched.
            continue

        # Work on a copy so a failure part-way leaves the sample unmodified.
        input_ids = list(encoding['input_ids'][i])
        for j, clip in enumerate(clips):
            try:
                pos = input_ids.index(audio_token_id)
            except ValueError:
                raise ValueError(
                    f"Sample {i} has no audio placeholder (id {audio_token_id}) left for clip {j}."
                ) from None
            # `.shape[0]` works for numpy arrays and torch tensors alike, whereas
            # `.size(0)` fails on numpy, where `size` is an int attribute.
            num_slots = get_num_embeddings(clip.shape[0])
            # Negative ids identify which clip of the sample fills these slots.
            input_ids[pos:pos + 1] = [-1 - j] * num_slots

        encoding['input_ids'][i] = input_ids
        # NOTE(review): this marks every position as attended, so any padding zeros
        # in the incoming mask are lost — presumably callers pass unpadded input.
        encoding['attention_mask'][i] = [1] * len(input_ids)
    return encoding
48
+
49
def get_num_embeddings(num_framses, adapter_kernel_size=7, adapter_stride=4) -> int:
    """
    Return the number of token slots reserved for an audio clip.

    The result is `ceil((num_framses - adapter_kernel_size) / adapter_stride) + 1`
    plus two slots for the `<|begin_of_audio|>` and `<|end_of_audio|>` markers.

    Args:
        num_framses (`int`):
            Number of feature frames in the clip. The spelling is kept as-is so that
            existing keyword callers continue to work.
        adapter_kernel_size (`int`, defaults to 7):
            Window length subtracted from the frame count.
        adapter_stride (`int`, defaults to 4):
            Step by which the remaining frames are divided.

    Returns:
        The slot count, including the two boundary markers.
    """
    # One slot each for <|begin_of_audio|> and <|end_of_audio|>.
    num_boundary_tokens = 2
    num_windows = math.ceil((num_framses - adapter_kernel_size) / adapter_stride) + 1
    return num_windows + num_boundary_tokens
51
+
52
+
53
class MllamaAudioFeatureExtractor(SeamlessM4TFeatureExtractor):
    """
    Feature extractor that runs the parent extractor on every clip of every sample
    and packs the results into one zero-padded 4-D array.
    """

    def __call__(
        self,
        batch_audio_clips: List[List[AudioInput]],
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """
        Extract and pack features for a batch of per-sample audio clip lists.

        Args:
            batch_audio_clips (`List[List[AudioInput]]`):
                Audio clips indexed as `[sample][clip]`.
            return_tensors (`str` or `TensorType`, *optional*):
                Passed to `BatchFeature` as `tensor_type`.

        Returns:
            A `BatchFeature` whose `"audio_features"` entry is the packed array
            produced by `pack_audio_clips`.

        Raises:
            ValueError: If the batch contains no clip, or a clip's features are not 2-D.
        """
        # Bind the parent implementation before the comprehensions: zero-argument
        # `super()` cannot be resolved inside a comprehension before Python 3.12.
        extract = super().__call__
        # NOTE(review): no `sampling_rate` is forwarded to the parent extractor —
        # confirm whether the parent should validate it for these clips.
        audio_features = [
            [extract(clip, return_attention_mask=False)['input_features'][0] for clip in clips]
            for clips in batch_audio_clips
        ]
        packed_audio_features = self.pack_audio_clips(audio_features)

        return BatchFeature(
            data={
                "audio_features": packed_audio_features,
            },
            tensor_type=return_tensors,
        )

    @staticmethod
    def pack_audio_clips(batch_audio_clips: List[List[np.ndarray]]) -> np.ndarray:
        """
        Zero-pad per-clip feature matrices into a single float32 array.

        Declared as a static method because it takes no `self`, yet is invoked as
        `self.pack_audio_clips(...)`.

        Args:
            batch_audio_clips (`List[List[np.ndarray]]`):
                Clip features indexed as `[sample][clip]`, each of shape
                `(num_frames, feature_dim)`.

        Returns:
            Array of shape `(batch_size, max_num_clips, max_frames, feature_dim)`.
            Positions with no clip, or past a clip's last frame, are zero.

        Raises:
            ValueError: If there is no clip at all, or a clip is not 2-D.
        """
        all_clips = [clip for clips in batch_audio_clips for clip in clips]
        if not all_clips:
            raise ValueError("pack_audio_clips needs at least one audio clip to infer the feature dimension.")
        for clip in all_clips:
            if clip.ndim != 2:
                raise ValueError(
                    f"Each audio clip must be 2-D (sequence length x feature dimension), got ndim={clip.ndim}."
                )

        # Output shape: (batch_size, max_num_clips, max_frames, feature_dim).
        batch_size = len(batch_audio_clips)
        max_num_clips = max(len(clips) for clips in batch_audio_clips)
        # `.shape` is used instead of `.size(...)`, which is not callable on numpy arrays.
        max_frames = max(clip.shape[0] for clip in all_clips)
        # Taken from the first clip found anywhere, so an empty first sample is fine.
        feature_dim = all_clips[0].shape[1]

        stacked_audio_clips = np.zeros((batch_size, max_num_clips, max_frames, feature_dim), dtype=np.float32)
        for i, clips in enumerate(batch_audio_clips):
            for j, clip in enumerate(clips):
                stacked_audio_clips[i, j, :clip.shape[0], :] = clip

        return stacked_audio_clips