Kamtera commited on
Commit
ee6552c
1 Parent(s): 8d96194

Update train_vits-2.py

Browse files
Files changed (1) hide show
  1. train_vits-2.py +21 -6
train_vits-2.py CHANGED
@@ -52,9 +52,29 @@ character_config=CharactersConfig(
52
  blank="<BLNK>",
53
  characters_class="TTS.tts.utils.text.characters.IPAPhonemes",
54
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  config = VitsConfig(
56
  audio=audio_config,
57
  run_name="vits_fa_female",
 
58
  batch_size=8,
59
  eval_batch_size=4,
60
  batch_group_size=5,
@@ -83,12 +103,7 @@ config = VitsConfig(
83
  ],
84
  output_path=output_path,
85
  datasets=[audio_config],
86
- d_vector_file=['/kaggle/working/speakers.pth'],
87
- use_d_vector_file=True,
88
- d_vector_dim=512,
89
- num_layers_text_encoder=10,
90
- speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
91
- speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
92
  # Enable the weighted sampler
93
  use_weighted_sampler=True,
94
  # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
 
52
  blank="<BLNK>",
53
  characters_class="TTS.tts.utils.text.characters.IPAPhonemes",
54
  )
55
+
56
+
57
+
58
+ model_args = VitsArgs(
59
+ d_vector_file=['/kaggle/working/speakers.pth'],
60
+ use_d_vector_file=True,
61
+ d_vector_dim=512,
62
+ num_layers_text_encoder=10,
63
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
64
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
65
+ # resblock_type_decoder="2", # In the paper, we accidentally trained YourTTS using ResNet blocks type 2; if you like, you can use the ResNet blocks type 1 like the VITS model
66
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
67
+ # use_speaker_encoder_as_loss=True,
68
+ # Useful parameters to enable multilingual training
69
+ # use_language_embedding=True,
70
+ # embedded_language_dim=4,
71
+ )
72
+
73
+
74
  config = VitsConfig(
75
  audio=audio_config,
76
  run_name="vits_fa_female",
77
+ model_args=model_args,
78
  batch_size=8,
79
  eval_batch_size=4,
80
  batch_group_size=5,
 
103
  ],
104
  output_path=output_path,
105
  datasets=[audio_config],
106
+
 
 
 
 
 
107
  # Enable the weighted sampler
108
  use_weighted_sampler=True,
109
  # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has