RoyalCities committed on
Commit
a16674b
1 Parent(s): 95e5fac

Upload 3 files

Browse files
RC_Vocal_Textures_Full_version.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20dedc588c83d8d65eb6375f6890a339f19f9bc6bb785a6230adf2bfe7c7a117
3
+ size 4854122382
RC_Vocal_Textures_Small_version.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebd6a78509508702d57bd75f8467caa85caf0efe9e265e16968d4ee7e16249e6
3
+ size 2427165507
model_config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 882000,
4
+ "sample_rate": 44100,
5
+ "audio_channels": 2,
6
+ "model": {
7
+ "pretransform": {
8
+ "type": "autoencoder",
9
+ "iterate_batch": true,
10
+ "config": {
11
+ "encoder": {
12
+ "type": "oobleck",
13
+ "requires_grad": false,
14
+ "config": {
15
+ "in_channels": 2,
16
+ "channels": 128,
17
+ "c_mults": [1, 2, 4, 8, 16],
18
+ "strides": [2, 4, 4, 8, 8],
19
+ "latent_dim": 128,
20
+ "use_snake": true
21
+ }
22
+ },
23
+ "decoder": {
24
+ "type": "oobleck",
25
+ "config": {
26
+ "out_channels": 2,
27
+ "channels": 128,
28
+ "c_mults": [1, 2, 4, 8, 16],
29
+ "strides": [2, 4, 4, 8, 8],
30
+ "latent_dim": 64,
31
+ "use_snake": true,
32
+ "final_tanh": false
33
+ }
34
+ },
35
+ "bottleneck": {
36
+ "type": "vae"
37
+ },
38
+ "latent_dim": 64,
39
+ "downsampling_ratio": 2048,
40
+ "io_channels": 2
41
+ }
42
+ },
43
+ "conditioning": {
44
+ "configs": [
45
+ {
46
+ "id": "prompt",
47
+ "type": "t5",
48
+ "config": {
49
+ "t5_model_name": "t5-base",
50
+ "max_length": 128
51
+ }
52
+ },
53
+ {
54
+ "id": "seconds_start",
55
+ "type": "number",
56
+ "config": {
57
+ "min_val": 0,
58
+ "max_val": 512
59
+ }
60
+ },
61
+ {
62
+ "id": "seconds_total",
63
+ "type": "number",
64
+ "config": {
65
+ "min_val": 0,
66
+ "max_val": 512
67
+ }
68
+ }
69
+ ],
70
+ "cond_dim": 768
71
+ },
72
+ "diffusion": {
73
+ "cross_attention_cond_ids": ["prompt", "seconds_start", "seconds_total"],
74
+ "global_cond_ids": ["seconds_start", "seconds_total"],
75
+ "type": "dit",
76
+ "config": {
77
+ "io_channels": 64,
78
+ "embed_dim": 1536,
79
+ "depth": 24,
80
+ "num_heads": 24,
81
+ "cond_token_dim": 768,
82
+ "global_cond_dim": 1536,
83
+ "project_cond_tokens": false,
84
+ "transformer_type": "continuous_transformer"
85
+ }
86
+ },
87
+ "io_channels": 64
88
+ },
89
+ "training": {
90
+ "use_ema": true,
91
+ "log_loss_info": false,
92
+ "optimizer_configs": {
93
+ "diffusion": {
94
+ "optimizer": {
95
+ "type": "AdamW",
96
+ "config": {
97
+ "lr": 5e-5,
98
+ "betas": [0.9, 0.999],
99
+ "weight_decay": 1e-3
100
+ }
101
+ },
102
+ "scheduler": {
103
+ "type": "InverseLR",
104
+ "config": {
105
+ "inv_gamma": 1000000,
106
+ "power": 0.5,
107
+ "warmup": 0.99
108
+ }
109
+ }
110
+ }
111
+ },
112
+ "demo": {
113
+ "demo_every": 150,
114
+ "demo_steps": 250,
115
+ "num_demos": 4,
116
+ "demo_cond": [
117
+ {"prompt": "Ensemble Vocal Texture, chord progression, G# major, 128BPM, 8 bars", "seconds_start": 0, "seconds_total": 15},
118
+ {"prompt": "Female Vocal Texture, chord progression, D# minor, 110BPM, 8 bars", "seconds_start": 0, "seconds_total": 17},
119
+ {"prompt": "Male Vocal Texture, chord progression, C# major, 140BPM, 4 bars", "seconds_start": 0, "seconds_total": 7},
120
+ {"prompt": "Ensemble Vocal Texture, chord progression, G# major, 128BPM, 8 bars", "seconds_start": 0, "seconds_total": 15}
121
+ ],
122
+ "demo_cfg_scales": [4, 7]
123
+ }
124
+ }
125
+ }