Course 8: Learning with Music Signals
Main Tutor/Lecturer: Simon Schwär, Hans-Ulrich Berendes, Prof. Dr. Meinard Müller
A vocoder (voice encoder) refers to a technology used to analyze and synthesize human voice by decomposing sound into its spectral envelope and excitation signal. Vocoder models include source-filter models inspired by the human vocal tract and sinusoidal models that combine time-varying sine waves. Sinusoidal modeling represents audio signals as sums of sinusoidal components, capturing the harmonic structure of sounds. Recently, Differentiable Digital Signal Processing (DDSP) has been introduced as a framework that merges traditional DSP with deep learning to enhance audio synthesis capabilities. By integrating classic signal processing with modern machine learning, DDSP improves flexibility, accuracy, and expressivity in audio applications. In this group, we explore recent advances in DDSP, specifically for synthesizing singing.
Another promising direction for music synthesis is a class of generative models called "Denoising Diffusion Probabilistic Models" (DDPMs), or just "Diffusion Models" for short. Some well-known examples of this model class have been introduced in the context of image generation (e.g., Stable Diffusion, DALL-E), but lately DDPMs have gained increasing attention for music and singing synthesis as well. In contrast to DDSP-based models, DDPMs make few assumptions about the signals we want to synthesize and instead implicitly learn to model the underlying data distribution of a training set. If there is sufficient interest in this topic, we can have a small subgroup dealing with generative models and in particular with DDPMs. However, a good understanding of probability theory and deep learning is required.
% Serra and Smith (1990), Computer Music Journal 14(4): spectral modeling synthesis.
@article{SerraS90_SinesTransientsNoiseModel_CMJ,
  author    = {Serra, Xavier and {Smith III}, Julius},
  title     = {Spectral Modeling Synthesis: A Sound Analysis/Synthesis System Based on a Deterministic Plus Stochastic Decomposition},
  journal   = {Computer Music Journal},
  publisher = {The MIT Press},
  volume    = {14},
  number    = {4},
  pages     = {12--24},
  year      = {1990},
  url-pdf   = {1990_SerraS_SpectralModelingSynthesis_CMJ.pdf}
}
% Engel, Hantrakul, Gu, and Roberts (2020), ICLR: the DDSP paper.
@inproceedings{EngelHGR20_DifferentiableDSP_ICLR,
  author    = {Engel, Jesse and Hantrakul, Lamtharn and Gu, Chenjie and Roberts, Adam},
  title     = {{DDSP}: Differentiable Digital Signal Processing},
  booktitle = {Proceedings of the International Conference on Learning Representations ({ICLR})},
  address   = {Virtual},
  year      = {2020},
  url-pdf   = {2020_EngelHGR_DDSP_ICLR.pdf}
}
% Schwaer and Mueller (2023), IEEE Signal Processing Letters 30: multi-scale spectral loss.
% The "number" field is omitted rather than left empty, since an empty field
% triggers a BibTeX "empty field" warning.
@article{SchwaerM23_MultiScaleSpecLoss_IEEE-SPL,
  author  = {Schw{\"a}r, Simon and M{\"u}ller, Meinard},
  title   = {Multi-Scale Spectral Loss Revisited},
  journal = {{IEEE} Signal Processing Letters},
  volume  = {30},
  pages   = {1712--1716},
  year    = {2023},
  doi     = {10.1109/LSP.2023.3333205},
  url-pdf = {2023_SchwaerM_MSSLossRevisited_IEEE-SPL.pdf}
}
% Yu and Fazekas (2023), ISMIR: singing voice synthesis with differentiable LPC.
% The acronym LPC is brace-protected so sentence-casing styles do not lowercase it.
@inproceedings{YuF23_DifferentiableLPC_ISMIR,
  author    = {Yu, Chin-Yun and Fazekas, Gy{\"o}rgy},
  title     = {Singing Voice Synthesis Using Differentiable {LPC} and Glottal-Flow-Inspired Wavetables},
  booktitle = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  pages     = {667--675},
  address   = {Milano, Italy},
  year      = {2023},
  url-pdf   = {2023_YuF_GlottalFlowSVS_ISMIR.pdf}
}
% Hayes, Shier, Fazekas, McPherson, and Saitis (2024), Frontiers in Signal Processing 3:
% review of differentiable DSP for music and speech synthesis.
@article{HayesSFMS24_DDSPReview_Frontiers,
  author  = {Hayes, Ben and Shier, Jordie and Fazekas, Gy{\"{o}}rgy and McPherson, Andrew and Saitis, Charalampos},
  title   = {A review of differentiable digital signal processing for music and speech synthesis},
  journal = {Frontiers in Signal Processing},
  volume  = {3},
  year    = {2024},
  issn    = {2673-8198},
  doi     = {10.3389/frsip.2023.1284100},
  url-pdf = {2024_HayesSFMS_DDSPReview_Frontiers.pdf}
}
% Schwaer, Krause, Fast, Rosenzweig, Scherbaum, and Mueller (2024), TISMIR 7(1):
% dataset of larynx microphone recordings for singing voice reconstruction.
@article{SchwaerKFRSM24_LarynxMicSVR_TISMIR,
  author  = {Schw{\"a}r, Simon and Krause, Michael and Fast, Michael and Rosenzweig, Sebastian and Scherbaum, Frank and M{\"u}ller, Meinard},
  title   = {A Dataset of Larynx Microphone Recordings for Singing Voice Reconstruction},
  journal = {Transactions of the International Society for Music Information Retrieval ({TISMIR})},
  volume  = {7},
  number  = {1},
  pages   = {30--43},
  year    = {2024},
  doi     = {10.5334/tismir.166},
  url-pdf = {2024_SchwaerKFRSM_LarynxMicSVR_TISMIR.pdf}
}
% Kong, Ping, Huang, Zhao, and Catanzaro: DiffWave, a diffusion model for audio synthesis (arXiv).
% NOTE(review): the citation key says 2022 while the year field is 2021; the key is
% left unchanged so existing citations keep resolving -- confirm the intended year.
@article{Kong2022_DiffWave_arxiv,
  author  = {Kong, Zhifeng and Ping, Wei and Huang, Jiaji and Zhao, Kexin and Catanzaro, Bryan},
  title   = {{DiffWave}: A Versatile Diffusion Model for Audio Synthesis},
  journal = {arXiv},
  volume  = {abs/2009.09761},
  year    = {2021},
  doi     = {10.48550/arXiv.2009.09761},
  url-pdf = {2022_Kong_DiffWave_arxiv.pdf}
}
% Hawthorne et al. (2022), arXiv: multi-instrument music synthesis with spectrogram diffusion.
@article{Hawthorne2022_MultiInstSynth_arxiv,
  author  = {Hawthorne, Curtis and Simon, Ian and Roberts, Adam and Zeghidour, Neil and Gardner, Josh and Manilow, Ethan and Engel, Jesse},
  title   = {Multi-instrument Music Synthesis with Spectrogram Diffusion},
  journal = {arXiv},
  volume  = {abs/2206.05408},
  year    = {2022},
  doi     = {10.48550/arXiv.2206.05408},
  url-pdf = {2022_Hawthorne_MultiInstSynth_arxiv.pdf}
}
% Maman, Zeitler, Mueller, and Bermano (2024), ICASSP: performance conditioning for
% diffusion-based multi-instrument music synthesis.
@inproceedings{Maman2024_PerformanceConditioning_ICASSP,
  author       = {Maman, Ben and Zeitler, Johannes and M{\"u}ller, Meinard and Bermano, Amit H.},
  title        = {Performance Conditioning for Diffusion-Based Multi-Instrument Music Synthesis},
  booktitle    = {Proceedings of the {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages        = {5045--5049},
  year         = {2024},
  organization = {IEEE},
  url-pdf      = {2024_Maman_PerformanceConditioning_ICASSP.pdf}
}