{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('../../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Init model and load weights\n",
      "Model ready\n",
      "Globbed 12 wav files.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 11.08it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "wav.shape: (97681,)\n",
      "acoustic_token: tensor([[[ 11, 591, 281, 629],\n",
      "         [733, 591, 401, 139],\n",
      "         [500, 591, 733, 600],\n",
      "         ...,\n",
      "         [733, 591, 451, 346],\n",
      "         [733, 591, 401, 139],\n",
      "         [386, 591, 281, 461]]], device='cuda:0')\n",
      "acoustic_token.shape: torch.Size([1, 305, 4])\n",
      "acoustic_token.dtype: torch.int64\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import glob\n",
    "import json\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import librosa\n",
    "import torch\n",
    "from academicodec.models.hificodec.vqvae import VQVAE\n",
    "from librosa.util import normalize\n",
    "from tqdm import tqdm\n",
    "\n",
    "ckpt_path = './checkpoint/HiFi-Codec-24k-320d'\n",
    "config_path = './config_24k_320d.json'\n",
    "with open(config_path, 'r') as f:\n",
    "    config = json.load(f)\n",
    "    sample_rate = config['sampling_rate']\n",
    "\n",
    "outputdir = './output'\n",
    "inputdir = './test_wav'\n",
    "num = 1024\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    Path(outputdir).mkdir(parents=True, exist_ok=True)\n",
    "    print(\"Init model and load weights\")\n",
    "    # make sure you downloaded the weights from https://huggingface.co/Dongchao/AcademiCodec/blob/main/HiFi-Codec-24k-320d \n",
    "    # and put it in ./checkpoint/\n",
    "    model = VQVAE(\n",
    "        config_path,\n",
    "        ckpt_path,\n",
    "        with_encoder=True)\n",
    "    model.cuda()\n",
    "    model.eval()\n",
    "    print(\"Model ready\")\n",
    "\n",
    "    wav_paths = glob.glob(f\"{inputdir}/*.wav\")[:num]\n",
    "    print(f\"Globbed {len(wav_paths)} wav files.\")\n",
    "    fid_to_acoustic_token = {}\n",
    "    for wav_path in tqdm(wav_paths[:1]):\n",
    "        wav, sr = librosa.load(wav_path, sr=sample_rate)\n",
    "        print(\"wav.shape:\",wav.shape)\n",
    "        assert sr == sample_rate\n",
    "        fid = os.path.basename(wav_path)[:-4]\n",
    "        wav = normalize(wav) * 0.95\n",
    "        wav = torch.FloatTensor(wav).unsqueeze(0)\n",
    "        wav = wav.to(torch.device('cuda'))\n",
    "        acoustic_token = model.encode(wav)\n",
    "        print(\"acoustic_token:\",acoustic_token)\n",
    "        print(\"acoustic_token.shape:\",acoustic_token.shape)\n",
    "        print(\"acoustic_token.dtype:\",acoustic_token.dtype)\n",
    "        fid = os.path.basename(wav_path)[:-4]\n",
    "        fid_to_acoustic_token[fid] = acoustic_token\n",
    "\n",
    "    torch.save(fid_to_acoustic_token,\n",
    "               os.path.join(outputdir, 'fid_to_acoustic_token.pth'))\n"
   ]
  }
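  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Round-trip sketch: decode the saved acoustic tokens back to audio, reusing `model`, `outputdir`, and `sample_rate` from the cell above. This assumes, as in the AcademiCodec reference code, that calling the `VQVAE` model on token indices of shape `(1, T, 4)` reconstructs a waveform; the `soundfile` dependency and the `*_recon.wav` output names are illustrative, not part of the original pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import soundfile as sf\n",
    "\n",
    "# reload the token dict written by the cell above\n",
    "fid_to_acoustic_token = torch.load(\n",
    "    os.path.join(outputdir, 'fid_to_acoustic_token.pth'))\n",
    "\n",
    "with torch.no_grad():\n",
    "    for fid, acoustic_token in fid_to_acoustic_token.items():\n",
    "        # assumption: VQVAE.forward embeds the (1, T, 4) token indices\n",
    "        # and runs the generator, returning a (1, 1, num_samples) waveform\n",
    "        recon = model(acoustic_token)\n",
    "        wav = recon.squeeze().cpu().numpy()\n",
    "        sf.write(os.path.join(outputdir, f'{fid}_recon.wav'), wav,\n",
    "                 sample_rate)\n"
   ]
  }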
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}