File size: 1,246 Bytes
e4bd7f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import torch
import torchaudio
from PIL import Image
import numpy as np


def load_image(image, image_processor):
    """Turn an image given in one of several forms into a batched tensor.

    Args:
        image: One of
            * ``str`` - a path opened with ``PIL.Image.open``;
            * ``PIL.Image.Image`` - an already-decoded image;
            * ``torch.Tensor`` - returned with a leading batch dimension
              added when it has exactly 3 dimensions, otherwise unchanged.
            Any other type is returned untouched.
        image_processor: Callable applied to the RGB PIL image. Its result
            must support ``.unsqueeze(0)``.

    Returns:
        The processed image with a leading dimension of size 1 for path and
        PIL inputs, the (possibly unsqueezed) tensor for tensor inputs, or
        the input itself for unsupported types.
    """
    if isinstance(image, torch.Tensor):
        # Already a tensor: only add the batch dimension when it is missing.
        return image.unsqueeze(0) if image.ndim == 3 else image

    if isinstance(image, str):  # is an image path
        # The context manager releases the file handle deterministically;
        # convert() fully decodes the pixels before the file is closed.
        with Image.open(image) as opened:
            rgb_image = opened.convert('RGB')
        return image_processor(rgb_image).unsqueeze(0)

    if isinstance(image, Image.Image):
        # Normalise the mode exactly like the path branch so RGBA/L/P images
        # reach the processor with three channels too.
        return image_processor(image.convert('RGB')).unsqueeze(0)

    # Unsupported types are passed through unchanged.
    return image


def load_audio(audio, audio_processor):
    """Load audio from a path or a ``(sample_rate, samples)`` tuple.

    Args:
        audio: Either
            * ``str`` - a path; the result of ``torchaudio.load(path)`` is
              passed straight to ``audio_processor``; or
            * ``tuple`` - ``(sample_rate, waveform)`` where ``waveform`` is a
              NumPy array with 1 or 2 dimensions. Integer samples are scaled
              by the maximum value of their dtype; floating-point samples
              are used as given. 2-D input is averaged over axis 1
              (presumably the channel axis, i.e. ``(samples, channels)``
              layout - confirm against the caller).
        audio_processor: Callable receiving ``(waveform_tensor, sample_rate)``
            for tuple input. Its result must support ``.unsqueeze(0)``.

    Returns:
        The processor output with a leading dimension of size 1 added.

    Raises:
        NotImplementedError: If ``audio`` is neither a ``str`` nor a
            ``tuple``, or the waveform is not 1-D or 2-D.
    """
    if isinstance(audio, str):  # is an audio path
        return audio_processor(torchaudio.load(audio)).unsqueeze(0)

    if not isinstance(audio, tuple):
        raise NotImplementedError(
            f"Unsupported audio input type: {type(audio).__name__}"
        )

    sample_rate, raw_waveform = audio
    if np.issubdtype(raw_waveform.dtype, np.floating):
        # np.iinfo() rejects float dtypes, so float samples are not rescaled.
        # Copy so the tensor does not alias the caller's array.
        waveform = raw_waveform.copy()
    else:
        # Integer PCM: scale by the dtype's maximum value.
        waveform = raw_waveform / np.iinfo(raw_waveform.dtype).max

    if waveform.ndim == 1:
        waveform = torch.from_numpy(waveform[None, :])
    elif waveform.ndim == 2:
        # Collapse axis 1 to a single channel, then add a leading axis.
        waveform = torch.from_numpy(waveform).mean(1).unsqueeze(0)
    else:
        raise NotImplementedError(
            f"Unsupported waveform with {waveform.ndim} dimensions"
        )

    return audio_processor((waveform, sample_rate)).unsqueeze(0)