This is the accompanying website for the paper "A Comparison of Recent Neural Vocoders for Speech Signal Reconstruction" by Prachi Govalkar, Johannes Fischer, Frank Zalkow, and Christian Dittmar.
In recent years, text-to-speech (TTS) synthesis has benefited from advanced machine learning approaches. Most prominently, since the introduction of the WaveNet [3] architecture, neural vocoders have exhibited superior performance in terms of the naturalness of synthesized speech signals in comparison to traditional vocoders. In this paper, a fair comparison of recent neural vocoders is presented in a signal reconstruction scenario. That means we use such techniques to resynthesize speech waveforms from mel-scaled spectrograms, a compact and generally non-invertible representation of the underlying audio signal. In that context, we conduct listening tests according to the well-established MUSHRA standard [1,2] and compare the attained results to similar studies [7,14,15,16]. Weighing the perceptual quality against the computational requirements, our findings shall serve as a guideline to both practitioners and researchers in speech synthesis.
Below, we provide one item from our test set in all different MUSHRA test conditions. With the integrated player, it is possible to switch seamlessly between the different conditions and get a visualization of the corresponding mel-scaled spectrogram at the same time. The mel-scaled spectrogram of the reference recording is always shown on top, directly below is the mel-scaled spectrogram derived from the synthesized signal. When switching between the conditions, it is interesting to look at the visual differences in the time-frequency domain as well.
Below, we provide the same test item in two additional conditions that were not included in our listening test. Here, the PGHI and SPSI phase reconstruction methods were applied to the original magnitude spectrogram, omitting the round trip to the mel-scale and back.
The audio examples used on this page are given for educational purposes only. If any legal problems occur, please contact us. Any content that allegedly infringes the copyright of a third party will be removed upon request by the copyright holder.
WNET | NVWN | FFTN | WRNN | WGLO | LPCN | |
---|---|---|---|---|---|---|
sampling_rate | 22050 | 16000 | 22050 | 16000 | 22050 | 16000 |
segment_length | - | 16000 | 5000 | 1375 | 16000 | 160 |
hop_size | 256 | 200 | - | 275 | 256 | 160 |
fft_size | 1024 | 800 | - | 1100 | 1024 | - |
window_size | - | 800 | - | - | 1024 | 320 |
num_mels | 80 | 80 | 80 | 80 | 80 | - |
mel_fmin | 125 | 30 | 125 | 40 | 0 | - |
mel_fmax | 7600 | 8000 | 7600 | - | 8000 | - |
input_type | raw | mulaw-quantized | mulaw-quantized | mulaw-quantized | raw | mulaw-quantized |
output_bit_depth | 16 | 8 | 8 | 8 | 16 | 8 |
framework | PyTorch | PyTorch/CUDA | Tensorflow | PyTorch | PyTorch | C/Keras |
num_epochs | 2000 | 100000 | 2000 | 1000 | 100000 | 120 |
batch_size | 2 | 8 | 8 | 64 | 12 | 64 |
initial_lr | 1e-3 | 1e-3 | 1e-3 | 1e-4 | 1e-4 | 1e-4 |
optimizer | Adam | Adam | Exponential Moving Average | Adam | Adam | AMSGrad |
@misc{ITU_15,
  author = {{International Telecommunications Union}},
  title  = {{ITU-R} {Rec.} {BS.1534-3}: Method for the subjective assessment of intermediate quality levels of coding systems},
  year   = {2015},
}
@inproceedings{SchoefflerSEH15_WebMUSHRA_WAC,
  author    = {Michael Schoeffler and Fabian-Robert St{\"o}ter and Bernd Edler and J{\"u}rgen Herre},
  title     = {Towards the Next Generation of Web-based Experiments: {A} Case Study Assessing Basic Audio Quality Following the {ITU-R} Recommendation {BS.1534} ({MUSHRA})},
  booktitle = {Proceedings of the Web Audio Conference},
  address   = {Paris, France},
  month     = jan,
  year      = {2015},
}
@inproceedings{OordDZSVGKSK16_WaveNet_SSW,
  author    = {A{\"{a}}ron van den Oord and Sander Dieleman and Heiga Zen and Karen Simonyan and Oriol Vinyals and Alex Graves and Nal Kalchbrenner and Andrew W. Senior and Koray Kavukcuoglu},
  title     = {WaveNet: {A} Generative Model for Raw Audio},
  booktitle = {Proceedings of the {ISCA} Speech Synthesis Workshop},
  address   = {Sunnyvale, CA, USA},
  month     = sep,
  pages     = {125},
  year      = {2016},
}
@inproceedings{OordLBSVKDLCSCG18_ParallelWaveNet_ICML,
  author    = {A{\"{a}}ron van den Oord and Yazhe Li and Igor Babuschkin and Karen Simonyan and Oriol Vinyals and Koray Kavukcuoglu and George van den Driessche and Edward Lockhart and Luis C. Cobo and Florian Stimberg and Norman Casagrande and Dominik Grewe and Seb Noury and Sander Dieleman and Erich Elsen and Nal Kalchbrenner and Heiga Zen and Alex Graves and Helen King and Tom Walters and Dan Belov and Demis Hassabis},
  title     = {Parallel {WaveNet}: Fast High-Fidelity Speech Synthesis},
  booktitle = {Proceedings of the International Conference on Machine Learning ({ICML})},
  address   = {Stockholm, Sweden},
  month     = jul,
  pages     = {3915--3923},
  year      = {2018},
}
@inproceedings{PrengerVC19_WaveGlow_ICASSP,
  author    = {Ryan Prenger and Rafael Valle and Bryan Catanzaro},
  title     = {{WaveGlow}: {A} Flow-based Generative Network for Speech Synthesis},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  address   = {Brighton, UK},
  month     = may,
  pages     = {3617--3621},
  year      = {2019},
}
@inproceedings{KalchbrennerESNCLSODK18_WaveRNN_ICML,
  author    = {Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and A{\"{a}}ron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
  title     = {Efficient Neural Audio Synthesis},
  booktitle = {Proceedings of the International Conference on Machine Learning ({ICML})},
  address   = {Stockholm, Sweden},
  month     = jul,
  pages     = {2415--2424},
  year      = {2018},
}
@inproceedings{ValinSkoglund19_LPCNet_ICASSP,
  author    = {Jean{-}Marc Valin and Jan Skoglund},
  title     = {{LPCNet}: Improving Neural Speech Synthesis Through Linear Prediction},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Brighton, UK},
  month     = may,
  year      = {2019},
}
@misc{Pharris18_NvWaveNet_Online,
  author       = {Brian Pharris},
  title        = {{Nv-Wavenet}: Better Speech Synthesis Using {GPU}-Enabled {WaveNet} Inference},
  year         = {2018},
  howpublished = {\url{https://devblogs.nvidia.com/nv-wavenet-gpu-speech-synthesis/}},
}
@inproceedings{ArikCCDGKLMNRSS17_DeepVoice_ICML,
  author    = {Sercan {\"{O}}mer Arik and Mike Chrzanowski and Adam Coates and Gregory Frederick Diamos and Andrew Gibiansky and Yongguo Kang and Xian Li and John Miller and Andrew Y. Ng and Jonathan Raiman and Shubho Sengupta and Mohammad Shoeybi},
  title     = {Deep Voice: Real-time Neural Text-to-Speech},
  booktitle = {Proceedings of the International Conference on Machine Learning ({ICML})},
  address   = {Sydney, NSW, Australia},
  month     = aug,
  pages     = {195--204},
  year      = {2017},
}
@article{PrusaBS17_PhaseReconstructionPGHI_IEEE-TASLP,
  author  = {Zdenek Prusa and Peter Bal{\'{a}}zs and Peter L. Sondergaard},
  title   = {A Noniterative Method for Reconstruction of Phase From {STFT} Magnitude},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume  = {25},
  number  = {5},
  pages   = {1154--1164},
  year    = {2017},
}
@inproceedings{PrusaSondergaard16_PhaseReconstructionRTPGHI_DAFx,
  author    = {Zdenek Prusa and Peter L. Sondergaard},
  title     = {Real-Time Spectrogram Inversion Using Phase Gradient Heap Integration},
  booktitle = {Proceedings of the International Conference on Digital Audio Effects ({DAFx})},
  address   = {Brno, Czech Republic},
  month     = sep,
  pages     = {17--21},
  year      = {2016},
}
@inproceedings{BeauregardHW15_PhaseReconstructionSPSI_DSP,
  author    = {Gerald T. Beauregard and Mithila Harish and Lonce Wyse},
  title     = {Single Pass Spectrogram Inversion},
  booktitle = {Proceedings of the {IEEE} International Conference on Digital Signal Processing ({DSP})},
  address   = {Singapore},
  month     = jul,
  pages     = {427--431},
  year      = {2015},
  doi       = {10.1109/ICDSP.2015.7251907},
}
@inproceedings{JinFML18_FFTNet_ICASSP,
  author    = {Zeyu Jin and Adam Finkelstein and Gautham J. Mysore and Jingwan Lu},
  title     = {{FFTNet}: {A} Real-Time Speaker-Dependent Neural Vocoder},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Calgary, Canada},
  month     = apr,
  pages     = {2251--2255},
  year      = {2018},
}
@inproceedings{KleijnLLSSWW18_WaveNet_SpeechCoding_ICASSP,
  author    = {W. Bastiaan Kleijn and Felicia S. C. Lim and Alejandro Luebs and Jan Skoglund and Florian Stimberg and Quan Wang and Thomas C. Walters},
  title     = {{Wavenet} Based Low Rate Speech Coding},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Calgary, Canada},
  month     = apr,
  pages     = {676--680},
  year      = {2018},
}
@inproceedings{WangLTJY18_NeuralVocoders_ICASSP,
  author    = {Xin Wang and Jaime Lorenzo{-}Trueba and Shinji Takaki and Lauri Juvela and Junichi Yamagishi},
  title     = {A Comparison of Recent Waveform Generation and Acoustic Modeling Methods for Neural-Network-Based Speech Synthesis},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Calgary, Canada},
  month     = apr,
  pages     = {4804--4808},
  year      = {2018},
}
@article{AiraksinenJBYA18_VocoderComparion_IEEE-TASLP,
  author  = {Manu Airaksinen and Lauri Juvela and Bajibabu Bollepalli and Junichi Yamagishi and Paavo Alku},
  title   = {A Comparison Between {STRAIGHT}, Glottal, and Sinusoidal Vocoding in Statistical Parametric Speech Synthesis},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume  = {26},
  number  = {9},
  pages   = {1658--1670},
  year    = {2018},
}