This is the accompanying website for the following paper:
@inproceedings{StrahlM26_MidLevelFusionF0_ICASSP,
  author    = {Strahl, Sebastian and M{\"u}ller, Meinard},
  title     = {Robust and Lightweight {F0} Estimation Through Mid-Level Fusion of {DSP}-Informed Features},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Barcelona, Spain},
  year      = {2026},
  note      = {Accepted for publication},
}
Fundamental frequency (F0) estimation is a key task in audio signal processing, traditionally addressed with digital signal processing (DSP) methods based on spectrogram or autocorrelation analysis. More recent deep learning approaches have improved accuracy and robustness, but often at the cost of high complexity and limited interpretability. We propose a hybrid approach that extracts mid-level features from classical DSP-based methods and fuses them using a neural network, thereby leveraging the strengths of model-based F0 estimators without relying on their hard decisions. Specifically, we use soft time–frequency representations derived from YIN, SWIPE, and the cepstrum alongside spectrograms. While spectrograms contain strong components at the F0 and higher harmonics, the other representations emphasize the F0 and subharmonics, thus providing complementary information. These features are then fused by a lightweight convolutional architecture with 6.5k trainable parameters. Cross-dataset experiments demonstrate that our method yields robust and accurate F0 estimates, achieving competitive performance compared to purely data-driven methods while largely preserving the interpretability of classical approaches.
This work was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Grant No. 500643750 (MU 2686/15-1). The authors are with the International Audio Laboratories Erlangen, a joint institution of the Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer Institute for Integrated Circuits IIS.
@article{CheveigneK02_YIN_JASA,
  author  = {de Cheveign{\'e}, Alain and Kawahara, Hideki},
  title   = {{YIN}, a fundamental frequency estimator for speech and music},
  journal = {Journal of the Acoustical Society of America (JASA)},
  year    = {2002},
  volume  = {111},
  number  = {4},
  pages   = {1917--1930},
}
@article{CamachoH08_SawtoothWaveform_JASA,
  author    = {Camacho, Arturo and Harris, John G.},
  title     = {A sawtooth waveform inspired pitch estimator for speech and music},
  journal   = {Journal of the Acoustical Society of America (JASA)},
  year      = {2008},
  volume    = {124},
  number    = {3},
  pages     = {1638--1652},
  publisher = {ASA},
}
@inproceedings{KimSLB18_CREPE_ICASSP,
  author    = {Kim, Jong Wook and Salamon, Justin and Li, Peter and Bello, Juan Pablo},
  title     = {{CREPE}: {A} Convolutional Representation for Pitch Estimation},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Calgary, Canada},
  year      = {2018},
  pages     = {161--165},
  doi       = {10.1109/ICASSP.2018.8461329},
}
@article{StrahlM25_df0_TASLP,
  author  = {Sebastian Strahl and Meinard M{\"u}ller},
  title   = {{dYIN} and {dSWIPE}: {D}ifferentiable Variants of Classical Fundamental Frequency Estimators},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  volume  = {33},
  pages   = {2622--2633},
  year    = {2025},
  doi     = {10.1109/TASLPRO.2025.3581119},
}