# Hydra defaults list. Entries are composed top to bottom, so the keys in this
# file (`_self_`, listed last) override whatever the earlier entries provide.
defaults:
  - base
  # NOTE(review): this entry arrived garbled as `[email protected]` (an e-mail
  # obfuscation artifact). It is reconstructed as `model@model.model`, i.e. the
  # `dual_ar_2_codebook_small` option mounted at package `model.model`, because
  # that is the node `${model.model.config.num_codebooks}` reads further down.
  # Confirm the group name against the config directory.
  - model@model.model: dual_ar_2_codebook_small
  - _self_

# Run-level settings.
project: text2semantic_finetune_dual_ar
# Shared sequence-length limit; the dataset and data-module sections read it
# through ${max_length}.
max_length: 2048
ckpt_path: checkpoints/text2semantic-medium-v1-2k.pth
# NOTE(review): presumably restricts what is restored from `ckpt_path` to the
# weights only - confirm against the training entry point.
resume_weights_only: true

# Trainer settings. `max_steps` is also consumed by the learning-rate schedule
# through ${trainer.max_steps}, so changing it rescales that schedule too.
trainer:
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  gradient_clip_algorithm: 'norm'
  max_steps: 1000
  precision: bf16-true
  limit_val_batches: 10
  val_check_interval: 100

# Tokenizer node. `_target_` names the callable to invoke and the remaining
# keys are passed to it as keyword arguments. Other sections reuse this node
# through ${tokenizer}.
tokenizer:
  _target_: transformers.AutoTokenizer.from_pretrained
  pretrained_model_name_or_path: fishaudio/fish-speech-1

# Training dataset node, referenced by the data module as ${train_dataset}.
train_dataset:
  _target_: fish_speech.datasets.text.AutoAugTextDataset
  proto_files:
    - data/protos
  tokenizer: ${tokenizer}
  max_length: ${max_length}
  # Resolved from the model config mounted at `model.model`.
  num_codebooks: ${model.model.config.num_codebooks}
  use_speaker: false

# Validation dataset node, referenced by the data module as ${val_dataset}.
val_dataset:
  _target_: fish_speech.datasets.text.AutoAugTextDataset
  # NOTE(review): this is the same path as `train_dataset.proto_files`, so
  # validation reads from the training data location - confirm this is
  # intended.
  proto_files:
    - data/protos
  tokenizer: ${tokenizer}
  max_length: ${max_length}
  # Resolved from the model config mounted at `model.model`.
  num_codebooks: ${model.model.config.num_codebooks}
  use_speaker: false

# Data module node. It wires together the two dataset nodes, the tokenizer and
# the shared length limit by interpolation rather than repeating their values.
data:
  _target_: fish_speech.datasets.text.TextDataModule
  train_dataset: ${train_dataset}
  val_dataset: ${val_dataset}
  num_workers: 4
  batch_size: 8
  tokenizer: ${tokenizer}
  max_length: ${max_length}

# Model node.
# NOTE(review): the source had lost its indentation. `optimizer` and
# `lr_scheduler` are nested under `model` here on the assumption that they are
# arguments of TextToSemantic (both are `_partial_` factories) - confirm.
model:
  _target_: fish_speech.models.text2semantic.TextToSemantic
  # Empty placeholder; the `defaults` list is expected to fill `model.model`
  # (the datasets read `${model.model.config.num_codebooks}` from it).
  model: {}

  # `_partial_: true` yields a partially-applied constructor, so arguments not
  # listed here are supplied later by the consumer.
  optimizer:
    _target_: torch.optim.AdamW
    _partial_: true
    # Written with an explicit mantissa dot so YAML 1.1 loaders also resolve
    # these as floats rather than strings.
    lr: 1.0e-5
    weight_decay: 0
    betas: [0.9, 0.95]
    eps: 1.0e-5

  lr_scheduler:
    _target_: torch.optim.lr_scheduler.LambdaLR
    _partial_: true
    # Nested node with its own `_target_`, passed as the `lr_lambda` argument.
    lr_lambda:
      _target_: fish_speech.scheduler.get_cosine_schedule_with_warmup_lr_lambda
      _partial_: true
      num_warmup_steps: 100
      # Kept in sync with the trainer's step budget by interpolation.
      num_training_steps: ${trainer.max_steps}

# Callback overrides. Only `every_n_train_steps` is set here; any other
# `model_checkpoint` settings would have to come from the `base` defaults.
callbacks:
  model_checkpoint:
    every_n_train_steps: 100