Update README.md
Browse files
README.md
CHANGED
|
@@ -22,7 +22,10 @@ pip install numpy torch torchaudio einops transformers efficientnet_pytorch
|
|
| 22 |
import torch
|
| 23 |
from transformers import AutoModel, PreTrainedTokenizerFast
|
| 24 |
import torchaudio
|
|
|
|
|
|
|
| 25 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
| 26 |
# use the model trained on AudioCaps
|
| 27 |
model = AutoModel.from_pretrained(
|
| 28 |
"wsntxxn/effb2-trm-audiocaps-captioning",
|
|
@@ -31,6 +34,7 @@ model = AutoModel.from_pretrained(
|
|
| 31 |
tokenizer = PreTrainedTokenizerFast.from_pretrained(
|
| 32 |
"wsntxxn/audiocaps-simple-tokenizer"
|
| 33 |
)
|
|
|
|
| 34 |
# inference on a single audio clip
|
| 35 |
wav, sr = torchaudio.load("/path/to/file.wav")
|
| 36 |
wav = torchaudio.functional.resample(wav, sr, model.config.sample_rate)
|
|
@@ -43,14 +47,18 @@ with torch.no_grad():
|
|
| 43 |
)
|
| 44 |
caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
|
| 45 |
print(caption)
|
|
|
|
| 46 |
# inference on a batch
|
| 47 |
wav1, sr1 = torchaudio.load("/path/to/file1.wav")
|
| 48 |
wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
|
| 49 |
wav1 = wav1.mean(0) if wav1.size(0) > 1 else wav1[0]
|
|
|
|
| 50 |
wav2, sr2 = torchaudio.load("/path/to/file2.wav")
|
| 51 |
wav2 = torchaudio.functional.resample(wav2, sr2, model.config.sample_rate)
|
| 52 |
wav2 = wav2.mean(0) if wav2.size(0) > 1 else wav2[0]
|
|
|
|
| 53 |
wav_batch = torch.nn.utils.rnn.pad_sequence([wav1, wav2], batch_first=True)
|
|
|
|
| 54 |
with torch.no_grad():
|
| 55 |
word_idxs = model(
|
| 56 |
audio=wav_batch,
|
|
|
|
| 22 |
import torch
|
| 23 |
from transformers import AutoModel, PreTrainedTokenizerFast
|
| 24 |
import torchaudio
|
| 25 |
+
|
| 26 |
+
|
| 27 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 28 |
+
|
| 29 |
# use the model trained on AudioCaps
|
| 30 |
model = AutoModel.from_pretrained(
|
| 31 |
"wsntxxn/effb2-trm-audiocaps-captioning",
|
|
|
|
| 34 |
tokenizer = PreTrainedTokenizerFast.from_pretrained(
|
| 35 |
"wsntxxn/audiocaps-simple-tokenizer"
|
| 36 |
)
|
| 37 |
+
|
| 38 |
# inference on a single audio clip
|
| 39 |
wav, sr = torchaudio.load("/path/to/file.wav")
|
| 40 |
wav = torchaudio.functional.resample(wav, sr, model.config.sample_rate)
|
|
|
|
| 47 |
)
|
| 48 |
caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
|
| 49 |
print(caption)
|
| 50 |
+
|
| 51 |
# inference on a batch
|
| 52 |
wav1, sr1 = torchaudio.load("/path/to/file1.wav")
|
| 53 |
wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
|
| 54 |
wav1 = wav1.mean(0) if wav1.size(0) > 1 else wav1[0]
|
| 55 |
+
|
| 56 |
wav2, sr2 = torchaudio.load("/path/to/file2.wav")
|
| 57 |
wav2 = torchaudio.functional.resample(wav2, sr2, model.config.sample_rate)
|
| 58 |
wav2 = wav2.mean(0) if wav2.size(0) > 1 else wav2[0]
|
| 59 |
+
|
| 60 |
wav_batch = torch.nn.utils.rnn.pad_sequence([wav1, wav2], batch_first=True)
|
| 61 |
+
|
| 62 |
with torch.no_grad():
|
| 63 |
word_idxs = model(
|
| 64 |
audio=wav_batch,
|