1. It loads an example audio file containing a drum recording.
2. It computes the STFT of the audio data.
3. It applies NMFD as described in [1], with audio-informed initialization of the components
4. It visualizes the decomposition results.
5. It resynthesizes the separated audio streams and saves them as wav files to the hard drive.
[1] Christian Dittmar, Meinard Müller
Reverse Engineering the Amen Break - Score-informed Separation and
Restoration applied to Drum Recordings
IEEE/ACM Transactions on Audio, Speech, and Language Processing,
24(9): 1531-1543, 2016.
[2] Patricio López-Serrano, Christian Dittmar, Yiğitcan Özer, and Meinard Müller
NMF Toolbox: Music Processing Applications of Nonnegative Matrix Factorization
In Proceedings of the International Conference on Digital Audio Effects (DAFx), 2019.
This file is part of 'NMF toolbox'. 'NMF toolbox' is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 'NMF toolbox' is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
import os
import numpy as np
import scipy.io.wavfile as wav
import IPython.display as ipd
from NMFtoolbox.forwardSTFT import forwardSTFT
from NMFtoolbox.inverseSTFT import inverseSTFT
from NMFtoolbox.initTemplates import initTemplates
from NMFtoolbox.initActivations import initActivations
from NMFtoolbox.NMFD import NMFD
from NMFtoolbox.alphaWienerFilter import alphaWienerFilter
from NMFtoolbox.visualizeComponentsNMF import visualizeComponentsNMF
from NMFtoolbox.utils import make_monaural, pcmInt16ToFloat32Numpy
# Locations of the example recording and of the generated results.
inpPath = '../data/'
outPath = 'output/'

# Create the output directory if it doesn't exist. exist_ok=True turns the
# call into a no-op when the directory is already present and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(outPath, exist_ok=True)

# Read the example recording: fs is the sampling rate, x the sample data.
filename = 'runningExample_AmenBreak.wav'
fs, x = wav.read(os.path.join(inpPath, filename))

# make monaural if necessary
x = make_monaural(x)

# convert wav from int16 to float32
x = pcmInt16ToFloat32Numpy(x)
# Spectral parameters for the STFT. The analysis window is a Hann window
# spanning exactly one block.
paramSTFT = {'blockSize': 2048, 'hopSize': 512}
paramSTFT.update(
    winFunc=np.hanning(paramSTFT['blockSize']),
    reconstMirror=True,
    appendFrame=True,
    numSamples=len(x),
)

# STFT computation. A is the input handed to NMFD further down, and P is
# reused as the phase term when the components are resynthesized.
X, A, P = forwardSTFT(x, paramSTFT)

# Spectrogram dimensions, plus the time step per frame (hop / fs) and the
# frequency step per bin (fs / block size).
numBins, numFrames = X.shape
deltaT = paramSTFT['hopSize'] / fs
deltaF = fs / paramSTFT['blockSize']
# Settings shared by the initialization helpers and by NMFD itself.
numComp = 3
numIter = 30
numTemplateFrames = 8

# Initial guess for the templates, using the toolbox's 'drums' strategy.
paramTemplates = {
    'deltaF': deltaF,
    'numComp': numComp,
    'numBins': numBins,
    'numTemplateFrames': numTemplateFrames,
}
initW = initTemplates(paramTemplates, 'drums')

# Initial activations, using the toolbox's 'uniform' strategy.
paramActivations = {'numComp': numComp, 'numFrames': numFrames}
initH = initActivations(paramActivations, 'uniform')

# Everything NMFD needs: problem size, iteration budget and the two
# initializations computed above.
paramNMFD = {
    'numComp': numComp,
    'numFrames': numFrames,
    'numIter': numIter,
    'numTemplateFrames': numTemplateFrames,
    'initW': initW,
    'initH': initH,
}

# NMFD core method; the fifth return value is not needed here.
nmfdW, nmfdH, nmfdV, divKL, _ = NMFD(A, paramNMFD)

# alpha-Wiener filtering of the NMFD component estimates (alpha = 1.0)
nmfdA, _ = alphaWienerFilter(A, nmfdV, 1.0)
# Visualize the decomposition: templates, activations and the filtered
# component spectrograms are passed to the toolbox plotting helper.
paramVis = {
    'deltaT': deltaT,
    'deltaF': deltaF,
    'endeSec': 3.8,
    'fontSize': 14,
}
fh1, _ = visualizeComponentsNMF(A, nmfdW, nmfdH, nmfdA, paramVis)
# Resynthesize every NMFD component: pair its Wiener-filtered spectrogram
# with the phase P of the original mixture, invert the STFT, keep the signal
# for playback and write it to the output directory.
audios = []
for k in range(numComp):
    Y = nmfdA[k] * np.exp(1j * P)
    y, _ = inverseSTFT(Y, paramSTFT)
    audios.append(y)

    # save result; the component index is the only varying part of the name
    out_filepath = os.path.join(
        outPath, 'Winstons_AmenBreak_NMFD_component_{}.wav'.format(k))
    wav.write(filename=out_filepath, rate=fs, data=y)
# Play back the original mixture followed by every separated component.
# A bare `ipd.Audio(...)` expression is only rendered when it is the last
# statement of a notebook cell, so each player is displayed explicitly.
# Iterating over `audios` keeps this in step with the number of components.
ipd.display(ipd.Audio(x, rate=fs))
for component in audios:
    ipd.display(ipd.Audio(component.T, rate=fs))