add spn_predictor
Browse files- README.md +18 -8
- config.json +1 -1
- hyperparams.yaml +24 -7
- model.ckpt +0 -3
README.md
CHANGED
|
@@ -16,13 +16,11 @@ metrics:
|
|
| 16 |
<iframe src="https://ghbtns.com/github-btn.html?user=speechbrain&repo=speechbrain&type=star&count=true&size=large&v=2" frameborder="0" scrolling="0" width="170" height="30" title="GitHub"></iframe>
|
| 17 |
<br/><br/>
|
| 18 |
|
| 19 |
-
**IMPORTANT: This is a work in progress. This model is not providing meaningful output at the moment**
|
| 20 |
-
|
| 21 |
# Text-to-Speech (TTS) with FastSpeech2 trained on LJSpeech
|
| 22 |
|
| 23 |
This repository provides all the necessary tools for Text-to-Speech (TTS) with SpeechBrain using a [FastSpeech2](https://arxiv.org/abs/2006.04558) pretrained on [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
| 24 |
|
| 25 |
-
The pre-trained model takes
|
| 26 |
|
| 27 |
|
| 28 |
## Install SpeechBrain
|
|
@@ -46,16 +44,28 @@ from speechbrain.pretrained import HIFIGAN
|
|
| 46 |
|
| 47 |
# Initialize TTS (FastSpeech2) and Vocoder (HiFIGAN)
|
| 48 |
fastspeech2 = FastSpeech2.from_hparams(source="speechbrain/tts-fastspeech2-ljspeech", savedir="tmpdir_tts")
|
| 49 |
-
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
#
|
| 52 |
-
|
|
|
|
| 53 |
|
| 54 |
# Running Vocoder (spectrogram-to-waveform)
|
| 55 |
waveforms = hifi_gan.decode_batch(mel_output)
|
| 56 |
|
| 57 |
# Save the waveform
|
| 58 |
-
torchaudio.save('
|
| 59 |
```
|
| 60 |
|
| 61 |
If you want to generate multiple sentences in one shot, you can do it in this way:
|
|
@@ -68,7 +78,7 @@ items = [
|
|
| 68 |
"How much wood would a woodchuck chuck?",
|
| 69 |
"Never odd or even"
|
| 70 |
]
|
| 71 |
-
mel_outputs, durations, pitch, energy = fastspeech2.
|
| 72 |
|
| 73 |
```
|
| 74 |
|
|
|
|
| 16 |
<iframe src="https://ghbtns.com/github-btn.html?user=speechbrain&repo=speechbrain&type=star&count=true&size=large&v=2" frameborder="0" scrolling="0" width="170" height="30" title="GitHub"></iframe>
|
| 17 |
<br/><br/>
|
| 18 |
|
|
|
|
|
|
|
| 19 |
# Text-to-Speech (TTS) with FastSpeech2 trained on LJSpeech
|
| 20 |
|
| 21 |
This repository provides all the necessary tools for Text-to-Speech (TTS) with SpeechBrain using a [FastSpeech2](https://arxiv.org/abs/2006.04558) pretrained on [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
|
| 22 |
|
| 23 |
+
The pre-trained model takes texts or phonemes as input and produces a spectrogram as output. One can get the final waveform by applying a vocoder (e.g., HiFIGAN) on top of the generated spectrogram. It should be noted that if the input is text, we use a state-of-the-art grapheme-to-phoneme module to convert it to phonemes and then pass the phonemes to the FastSpeech2 model.
|
| 24 |
|
| 25 |
|
| 26 |
## Install SpeechBrain
|
|
|
|
| 44 |
|
| 45 |
# Initialize TTS (FastSpeech2) and Vocoder (HiFIGAN)
|
| 46 |
fastspeech2 = FastSpeech2.from_hparams(source="speechbrain/tts-fastspeech2-ljspeech", savedir="tmpdir_tts")
|
| 47 |
+
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
|
| 48 |
+
|
| 49 |
+
# Run TTS with text input
|
| 50 |
+
input_text = "were the leaders in this luckless change; though our own Baskerville; who was at work some years before them; went much on the same lines;"
|
| 51 |
+
mel_output, durations, pitch, energy = fastspeech2.encode_text([input_text])
|
| 52 |
+
|
| 53 |
+
# Running Vocoder (spectrogram-to-waveform)
|
| 54 |
+
waveforms = hifi_gan.decode_batch(mel_output)
|
| 55 |
+
|
| 56 |
+
# Save the waveform
|
| 57 |
+
torchaudio.save('example_TTS_input_text.wav', waveforms.squeeze(1), 22050)
|
| 58 |
+
|
| 59 |
|
| 60 |
+
# Run TTS with phoneme input
|
| 61 |
+
input_phonemes = ['W', 'ER', 'DH', 'AH', 'L', 'IY', 'D', 'ER', 'Z', 'IH', 'N', 'DH', 'IH', 'S', 'L', 'AH', 'K', 'L', 'AH', 'S', 'CH', 'EY', 'N', 'JH', 'spn', 'DH', 'OW', 'AW', 'ER', 'OW', 'N', 'B', 'AE', 'S', 'K', 'ER', 'V', 'IH', 'L', 'spn', 'HH', 'UW', 'W', 'AA', 'Z', 'AE', 'T', 'W', 'ER', 'K', 'S', 'AH', 'M', 'Y', 'IH', 'R', 'Z', 'B', 'IH', 'F', 'AO', 'R', 'DH', 'EH', 'M', 'spn', 'W', 'EH', 'N', 'T', 'M', 'AH', 'CH', 'AA', 'N', 'DH', 'AH', 'S', 'EY', 'M', 'L', 'AY', 'N', 'Z', 'spn']
|
| 62 |
+
mel_output, durations, pitch, energy = fastspeech2.encode_phoneme([input_phonemes])
|
| 63 |
|
| 64 |
# Running Vocoder (spectrogram-to-waveform)
|
| 65 |
waveforms = hifi_gan.decode_batch(mel_output)
|
| 66 |
|
| 67 |
# Save the waveform
|
| 68 |
+
torchaudio.save('example_TTS_input_phoneme.wav', waveforms.squeeze(1), 22050)
|
| 69 |
```
|
| 70 |
|
| 71 |
If you want to generate multiple sentences in one shot, you can do it in this way:
|
|
|
|
| 78 |
"How much wood would a woodchuck chuck?",
|
| 79 |
"Never odd or even"
|
| 80 |
]
|
| 81 |
+
mel_outputs, durations, pitch, energy = fastspeech2.encode_text(items)
|
| 82 |
|
| 83 |
```
|
| 84 |
|
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"speechbrain_interface": "FastSpeech2",
|
| 3 |
"vocoder_interface": "HiFIGAN",
|
| 4 |
-
"vocoder_model_id": "speechbrain/tts-hifigan-
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"speechbrain_interface": "FastSpeech2",
|
| 3 |
"vocoder_interface": "HiFIGAN",
|
| 4 |
+
"vocoder_model_id": "speechbrain/tts-hifigan-ljspeech"
|
| 5 |
}
|
hyperparams.yaml
CHANGED
|
@@ -45,26 +45,26 @@ lexicon:
|
|
| 45 |
- ZH
|
| 46 |
- spn
|
| 47 |
|
| 48 |
-
n_symbols:
|
| 49 |
padding_idx: 0
|
| 50 |
n_mel_channels: 80
|
| 51 |
|
| 52 |
# Encoder parameters
|
| 53 |
enc_num_layers: 4
|
| 54 |
enc_num_head: 2
|
| 55 |
-
enc_d_model:
|
| 56 |
enc_ffn_dim: 1024
|
| 57 |
-
enc_k_dim:
|
| 58 |
-
enc_v_dim:
|
| 59 |
enc_dropout: 0.2
|
| 60 |
|
| 61 |
# Decoder parameters
|
| 62 |
dec_num_layers: 4
|
| 63 |
dec_num_head: 2
|
| 64 |
-
dec_d_model:
|
| 65 |
dec_ffn_dim: 1024
|
| 66 |
-
dec_k_dim:
|
| 67 |
-
dec_v_dim:
|
| 68 |
dec_dropout: 0.2
|
| 69 |
|
| 70 |
# Postnet parameters
|
|
@@ -84,6 +84,21 @@ pitch_pred_kernel_size: 3
|
|
| 84 |
energy_pred_kernel_size: 3
|
| 85 |
variance_predictor_dropout: 0.5
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
# Model
|
| 88 |
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
|
| 89 |
enc_num_layers: !ref <enc_num_layers>
|
|
@@ -119,8 +134,10 @@ model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
|
|
| 119 |
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
|
| 120 |
|
| 121 |
modules:
|
|
|
|
| 122 |
model: !ref <model>
|
| 123 |
|
| 124 |
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
| 125 |
loadables:
|
|
|
|
| 126 |
model: !ref <model>
|
|
|
|
| 45 |
- ZH
|
| 46 |
- spn
|
| 47 |
|
| 48 |
+
n_symbols: 42 # fixed depending on symbols in the lexicon, +1 for a dummy symbol used for padding
|
| 49 |
padding_idx: 0
|
| 50 |
n_mel_channels: 80
|
| 51 |
|
| 52 |
# Encoder parameters
|
| 53 |
enc_num_layers: 4
|
| 54 |
enc_num_head: 2
|
| 55 |
+
enc_d_model: 384
|
| 56 |
enc_ffn_dim: 1024
|
| 57 |
+
enc_k_dim: 384
|
| 58 |
+
enc_v_dim: 384
|
| 59 |
enc_dropout: 0.2
|
| 60 |
|
| 61 |
# Decoder parameters
|
| 62 |
dec_num_layers: 4
|
| 63 |
dec_num_head: 2
|
| 64 |
+
dec_d_model: 384
|
| 65 |
dec_ffn_dim: 1024
|
| 66 |
+
dec_k_dim: 384
|
| 67 |
+
dec_v_dim: 384
|
| 68 |
dec_dropout: 0.2
|
| 69 |
|
| 70 |
# Postnet parameters
|
|
|
|
| 84 |
energy_pred_kernel_size: 3
|
| 85 |
variance_predictor_dropout: 0.5
|
| 86 |
|
| 87 |
+
# silent phoneme token predictor
|
| 88 |
+
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
|
| 89 |
+
enc_num_layers: !ref <enc_num_layers>
|
| 90 |
+
enc_num_head: !ref <enc_num_head>
|
| 91 |
+
enc_d_model: !ref <enc_d_model>
|
| 92 |
+
enc_ffn_dim: !ref <enc_ffn_dim>
|
| 93 |
+
enc_k_dim: !ref <enc_k_dim>
|
| 94 |
+
enc_v_dim: !ref <enc_v_dim>
|
| 95 |
+
enc_dropout: !ref <enc_dropout>
|
| 96 |
+
normalize_before: !ref <normalize_before>
|
| 97 |
+
ffn_type: !ref <ffn_type>
|
| 98 |
+
ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
|
| 99 |
+
n_char: !ref <n_symbols>
|
| 100 |
+
padding_idx: !ref <padding_idx>
|
| 101 |
+
|
| 102 |
# Model
|
| 103 |
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
|
| 104 |
enc_num_layers: !ref <enc_num_layers>
|
|
|
|
| 134 |
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
|
| 135 |
|
| 136 |
modules:
|
| 137 |
+
spn_predictor: !ref <spn_predictor>
|
| 138 |
model: !ref <model>
|
| 139 |
|
| 140 |
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
| 141 |
loadables:
|
| 142 |
+
spn_predictor: !ref <spn_predictor>
|
| 143 |
model: !ref <model>
|
model.ckpt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:dfc40f7ad123936bb18f56d6f2392198c09b50eb416fdfb0895ec2077f4ed6cc
|
| 3 |
-
size 114702155
|
|
|
|
|
|
|
|
|
|
|
|