This is the accompanying website for the following paper:
@inproceedings{KalitaDSZHP24_PAD-VC_IWAENC,
  author      = {Arunava Kr. Kalita and Christian Dittmar and Paolo Sani and Frank Zalkow and Emanu{\"e}l A. P. Habets and Rusha Patra},
  title       = {{PAD-VC}: {A} Prosody-Aware Decoder for Any-to-Few Voice Conversion},
  booktitle   = {Proceedings of the International Workshop on Acoustic Signal Enhancement ({IWAENC})},
  address     = {Aalborg, Denmark},
  pages       = {389--393},
  year        = {2024},
  doi         = {10.1109/IWAENC61483.2024.10694576},
  url-pdf     = {https://ieeexplore.ieee.org/document/10694576},
  url-details = {https://www.audiolabs-erlangen.de/resources/NLUI/2024-PAD-VC}
}
Voice conversion (VC) is the process of synthetically generating speech based on some source speaker recording, aiming to preserve its linguistic information while using a specified target speaker's timbral characteristics. In this paper, we propose PAD-VC, a prosody-aware VC model based on the decoder part of the ForwardTacotron (FT) architecture. We train PAD-VC with prosody features such as pitch, energy, and voicing confidence and augment those with linguistic features derived from a phoneme posteriorgram (PPG) representation of the source utterance. This way, we can handle both phonemic information and frame-wise supra-segmental features. During inference time, the source speaker's prosody features are modified to match the prosody statistics of the target speaker. We show that our proposed PAD-VC surpasses the prosody-cloning performance of FT on unseen source speakers in terms of similarity and naturalness.
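To illustrate the inference-time prosody modification mentioned in the abstract, the sketch below adapts a source F0 contour to a target speaker's prosody statistics via mean-variance matching in the log-F0 domain. This is only a minimal illustration under assumptions of ours: the function name match_f0_statistics, the log-domain formulation, the treatment of unvoiced frames, and the example target statistics are hypothetical and not taken from the paper, which only states that the source speaker's prosody features are modified to match the target speaker's statistics.

import numpy as np

def match_f0_statistics(source_f0, target_log_mean, target_log_std, eps=1e-8):
    """Shift and scale the voiced part of a source F0 contour so that its
    log-F0 mean and standard deviation match the given target statistics.
    Unvoiced frames (F0 == 0) are passed through unchanged."""
    f0 = np.asarray(source_f0, dtype=float)
    voiced = f0 > 0
    if not np.any(voiced):
        return f0  # nothing to modify

    log_f0 = np.log(f0[voiced])
    src_mean, src_std = log_f0.mean(), log_f0.std()

    converted = f0.copy()
    converted[voiced] = np.exp(
        (log_f0 - src_mean) / (src_std + eps) * target_log_std + target_log_mean
    )
    return converted

# Toy usage: move a contour centred around 120 Hz towards a target speaker
# whose (made-up) statistics correspond to roughly 220 Hz.
rng = np.random.default_rng(0)
source_f0 = np.where(rng.random(200) > 0.2,
                     np.exp(rng.normal(np.log(120.0), 0.15, 200)), 0.0)
converted_f0 = match_f0_statistics(source_f0,
                                   target_log_mean=np.log(220.0),
                                   target_log_std=0.2)
print(converted_f0[source_f0 > 0][:5])

A similar normalization could in principle be applied to the energy contour; whether and how the remaining prosody features are adjusted is left open in this sketch.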
Here, we provide the audio samples used in our test on speech naturalness, i.e., two synthetic versions (FT-VC and PAD-VC) and a reference obtained by copy synthesis (REF) for each of eight different text prompts. In addition, we provide samples for PAD-VC + PostProGAN, where GAN-based post-processing [5] has been applied to the mel spectrograms predicted by PAD-VC. This condition was not included in the listening tests. Please note that, in this test on speech naturalness, the synthetic versions correspond to a different speaker identity than the reference.
Here, we provide audio samples used in our test on speaker similarity, i.e., two synthetic versions (FT-VC and PAD-VC) and a reference obtained by copy synthesis (REF) corresponding to the same speaker identity. In addition, we provide samples for PAD-VC + PostProGAN, where a GAN-based post-processing [5] has been applied to the mel spectrograms predicted by PAD-VC. This condition was not included in the listening tests. Please note that the synthetic versions and the reference correspond to different text prompts in this test on speaker similarity.
The first author conducted this work during his internship with the TTS group at Fraunhofer IIS, Erlangen. This internship was made possible through the generous support and funding provided by IGSTC. This research was partially supported by the Free State of Bavaria within the DSAI project and by the Fraunhofer-Zukunftsstiftung.
@inproceedings{ChurchwellEtAl24_NeuralPPG_ICASSP,
  author    = {Cameron Churchwell and Max Morrison and Bryan Pardo},
  title     = {High-Fidelity Neural Phonetic Posteriorgrams},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association (Interspeech)},
  address   = {Seoul, Korea},
  pages     = {4287--4291},
  year      = {2024}
}

@misc{Schaefer20_ForwardTacotron_Github,
  author       = {Christian Sch{\"a}fer and Ollie McCarthy and contributors},
  title        = {{ForwardTacotron}},
  howpublished = {\url{https://github.com/as-ideas/ForwardTacotron}},
  journal      = {GitHub repository},
  publisher    = {GitHub},
  year         = {2020}
}

@article{SismanEtAl20_VoiceConversionverview_TASLP,
  author  = {Berrak Sisman and Junichi Yamagishi and Simon King and Haizhou Li},
  title   = {An overview of voice conversion and its challenges: {F}rom statistical modeling to deep learning},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  volume  = {29},
  pages   = {132--157},
  year    = {2020}
}

@inproceedings{SunEtAl16_PPG-VoiceConversion_ICME,
  author    = {Lifa Sun and Kun Li and Hao Wang and Shiyin Kang and Helen Meng},
  title     = {Phonetic posteriorgrams for many-to-one voice conversion without parallel data training},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia and Expo ({ICME})},
  address   = {Seattle, WA, USA},
  year      = {2016}
}

@inproceedings{SaniEtAl23_PostProcessingGAN_ITG,
  author    = {Paolo Sani and Judith Bauer and Frank Zalkow and Emanu{\"e}l A. P. Habets and Christian Dittmar},
  title     = {Improving the Naturalness of Synthesized Spectrograms for {TTS} Using {GAN}-Based Post-Processing},
  booktitle = {Proceedings of the {ITG} Conference on Speech Communication},
  address   = {Aachen, Germany},
  pages     = {270--274},
  year      = {2023}
}