File size: 8,404 Bytes
8c92a11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
{
    "base_config": "config/svc/base.json",
    "model_type": "VITS",
    "task_type": "svc",
    "preprocess": {
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_energy": true,
        "extract_uv": true,
        "extract_linear_spec": true,
        "extract_audio": true,
        "mel_min_max_norm": true,
        // Config for features usage
        "use_linear": true,
        "use_mel": true,
        "use_min_max_norm_mel": false,
        "use_audio": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_spkid": true,
        "use_contentvec": false,
        "use_whisper": false,
        "use_wenet": false,
        "use_text": false,
        "use_phone": false,
        "fmin": 0,
        "fmax": 12000,
        "f0_min": 50,
        "f0_max": 1100,
        // f0_bin in sovits
        "pitch_bin": 256,
        // filter_length in sovits
        "n_fft": 1024,
        // hop_length in sovits
        "hop_size": 256,
        // win_length in sovits
        "win_size": 1024,
        "segment_size": 8192,
        "n_mel": 100,
        "sample_rate": 24000,
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        // Meta file
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256,
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "use_log_loudness": true,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            "singer_table_size": 512,
            "output_singer_dim": 384,
            "output_content_dim": 384,
            "use_spkid": true,
            "pitch_max": 1100.0,
            "pitch_min": 50.0,
        },
        "vits": {
            "filter_channels": 256,
            "gin_channels": 256,
            "hidden_channels": 384,
            "inter_channels": 384,
            "kernel_size": 3,
            "n_flow_layer": 4,
            "n_heads": 2,
            "n_layers": 6,
            "n_layers_q": 3,
            "n_speakers": 512,
            "p_dropout": 0.1,
            "use_spectral_norm": false,
        },
        "generator": "hifigan",
        "generator_config": {
            "hifigan": {
                "resblock": "1",
                "resblock_kernel_sizes": [
                    3,
                    7,
                    11
                ],
                "upsample_rates": [
                    8,
                    8,
                    2,
                    2
                ],
                "upsample_kernel_sizes": [
                    16,
                    16,
                    4,
                    4
                ],
                "upsample_initial_channel": 512,
                "resblock_dilation_sizes": [
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ]
                ]
            },
            "melgan": {
                "ratios": [
                    8,
                    8,
                    2,
                    2
                ],
                "ngf": 32,
                "n_residual_layers": 3,
                "num_D": 3,
                "ndf": 16,
                "n_layers": 4,
                "downsampling_factor": 4
            },
            "bigvgan": {
                "resblock": "1",
                "activation": "snakebeta",
                "snake_logscale": true,
                "upsample_rates": [
                    8,
                    8,
                    2,
                    2
                ],
                "upsample_kernel_sizes": [
                    16,
                    16,
                    4,
                    4
                ],
                "upsample_initial_channel": 512,
                "resblock_kernel_sizes": [
                    3,
                    7,
                    11
                ],
                "resblock_dilation_sizes": [
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ]
                ]
            },
            "nsfhifigan": {
                "resblock": "1",
                "harmonic_num": 8,
                "upsample_rates": [
                    8,
                    8,
                    2,
                    2
                ],
                "upsample_kernel_sizes": [
                    16,
                    16,
                    4,
                    4
                ],
                "upsample_initial_channel": 768,
                "resblock_kernel_sizes": [
                    3,
                    7,
                    11
                ],
                "resblock_dilation_sizes": [
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ]
                ]
            },
            "apnet": {
                "ASP_channel": 512,
                "ASP_resblock_kernel_sizes": [
                    3,
                    7,
                    11
                ],
                "ASP_resblock_dilation_sizes": [
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ]
                ],
                "ASP_input_conv_kernel_size": 7,
                "ASP_output_conv_kernel_size": 7,
                "PSP_channel": 512,
                "PSP_resblock_kernel_sizes": [
                    3,
                    7,
                    11
                ],
                "PSP_resblock_dilation_sizes": [
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ],
                    [
                        1,
                        3,
                        5
                    ]
                ],
                "PSP_input_conv_kernel_size": 7,
                "PSP_output_R_conv_kernel_size": 7,
                "PSP_output_I_conv_kernel_size": 7,
            }
        },
    },
    "train": {
        "fp16_run": true,
        "learning_rate": 2e-4,
        "betas": [
            0.8,
            0.99
        ],
        "eps": 1e-9,
        "batch_size": 16,
        "lr_decay": 0.999875,
        // "segment_size": 8192,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "AdamW": {
            "betas": [
                0.8,
                0.99
            ],
            "eps": 1e-9,
        }
    }
}