import numpy as np
import matplotlib.pyplot as plt
import librosa
import IPython.display as ipd
def get_dft(x):
    """
    Return the non-redundant real/imaginary components
    of the DFT, expressed as amplitudes of the sines/cosines
    involved

    Parameters
    ----------
    x: ndarray(N)
        A signal

    Returns
    -------
    cos_sums: ndarray(ceil(N/2))
        DFT cosine amplitudes
    sin_sums: ndarray(ceil(N/2))
        DFT sine amplitudes
    """
    N = len(x)
    n_freqs = int(np.ceil(N/2))
    f = np.fft.fft(x)
    # The cosine amplitudes are the real part of the FFT and the
    # sine amplitudes are the negated imaginary part, each
    # normalized by N/2
    cos_sums = np.real(f)[0:n_freqs]/(N/2)
    sin_sums = -np.imag(f)[0:n_freqs]/(N/2)
    return cos_sums, sin_sums
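As a quick sanity check of this method, a pure cosine that goes through exactly 3 cycles over the signal should show up with an amplitude of 1 in cosine bin 3 and roughly zero everywhere else (the test length N = 64 below is just an arbitrary illustrative choice)
# Sanity check on an illustrative synthetic signal:
# a cosine with exactly 3 cycles should give cosine amplitude 1 in bin 3
N = 64
t = np.arange(N)/N
x_test = np.cos(2*np.pi*3*t)
c, s = get_dft(x_test)
print(np.round(c, 2))  # 1 at index 3, ~0 elsewhere
print(np.round(s, 2))  # ~0 everywhere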
Now, we load in an audio file of Prof. Tralie playing a chromatic scale on the violin, going through all 24 halfsteps over two octaves in order.
x, sr = librosa.load("chromatic2octaves.wav")
ipd.Audio(x, rate=sr)
When we look at the DFT amplitudes of the entire clip, however, everything gets blurred together; there are many notes that happen at different times, so it's not easy to pick out any single note.
x, sr = librosa.load("chromatic2octaves.wav")
N = len(x)
c, s = get_dft(x)
amps = np.sqrt(c**2 + s**2)
freqs = np.arange(len(c))*sr/len(x)
plt.plot(freqs, amps)
plt.xlabel("Frequency (hz)")
plt.ylabel("Amplitude")
plt.xlim([0, 4000])
ipd.Audio(x, rate=sr)
What we need is something where we compute the DFT on small chunks of audio so we can home in on regions where the frequencies aren't changing too much. This leads to the notion of the Short-Time Fourier Transform (STFT), where we perform many DFTs on small chunks of audio. There are two parameters that go into the STFT:
The window length, which is the length of a chunk of audio in samples that's processed
The hop length, which is the number of samples between windows as we jump from one to the next
Let's define a method that does this. We note that for audio with $N$ samples and an STFT with window length $w$ and hop length $h$, the number of windows is $\lfloor (N-w)/h \rfloor + 1$.
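For instance, with one second of audio at 44100hz and the window/hop lengths we'll use later (these particular numbers are just for illustration):
# Quick check of the window count formula on illustrative values
N, w, h = 44100, 2048, 256
print(int(np.floor((N - w)/h)) + 1)  # floor((44100-2048)/256) + 1 = 165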
We also define a special version of the STFT where we look at the amplitude only (no phase), which is referred to as a spectrogram.
def specgram(x, w, h, sr):
    """
    Compute the "spectrogram"
    (amplitudes of the short-time fourier transform)

    Parameters
    ----------
    x: ndarray(N)
        Full audio clip of N samples
    w: int
        Window length
    h: int
        Hop length
    sr: int
        Sample rate

    Returns
    -------
    ndarray(ceil(w/2), nwin)
        Spectrogram
    """
    N = len(x)
    nwin = int(np.floor((N-w)/h))+1
    n_freqs = int(np.ceil(w/2))
    # Make a 2D array
    # The rows correspond to frequency bins
    # The columns correspond to windows moved forward in time
    S = np.zeros((n_freqs, nwin))
    # Loop through all of the windows, and put the fourier
    # transform amplitudes of each window in its own column
    for j in range(nwin):
        # Pull out the audio in the jth window
        # Ex) First window x[0:w]
        # Ex) Second window x[h:h+w]
        # Ex) Third window x[2h:2h+w]
        xj = x[h*j:h*j+w]
        # Do the fourier transform of the jth window
        c, s = get_dft(xj)
        amps = np.sqrt(c**2 + s**2)
        # Put the fourier transform amplitudes into S
        S[:, j] = amps
    return S
There is, however, a tradeoff between time resolution and frequency resolution. If we want to really home in on exactly where notes are happening in time, we need to choose a small window. But we only have half as many frequency samples as there are samples in the window to spread out between $0hz$ and $sr/2$ hz, so a small window means a low frequency resolution. By contrast, if we take large windows, then we have many more frequencies to spread out between $0$ and $sr/2$, so we have a higher frequency resolution, but more things get blurred together in time. A pretty common happy medium people choose is a window length of 2048 at a 44100hz sample rate.
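To put some numbers on this tradeoff, here's a quick sketch (the window lengths below are just examples) printing the frequency spacing $sr/w$ and the time extent $w/sr$ of each window:
# Frequency spacing (sr/w) vs time span (w/sr) for a few example window lengths
sr = 44100
for w in [512, 2048, 8192]:
    print("w = {}: {:.2f}hz per bin, {:.1f}ms per window".format(w, sr/w, 1000*w/sr))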
# Time resolution vs frequency resolution tradeoff in window size
x, sr = librosa.load("chromatic2octaves.wav", sr=44100)
win = 2048
hop = 256
S = specgram(x, win, hop, sr)
times = hop*np.arange(S.shape[1])/sr
freqs = np.arange(S.shape[0])*sr/win
S = np.log10(S/np.min(S))
plt.figure(figsize=(12, 6))
plt.imshow(S, cmap='magma_r', extent = (times[0], times[-1], freqs[-1], freqs[0]), aspect='auto', interpolation='none')
plt.ylim([3000, 0])
plt.gca().invert_yaxis()
plt.colorbar()
In addition to notes changing, we can also see some other neat physical effects in spectrograms, like vibrato. Note that if I'm going back and forth by 10hz in the fundamental frequency, I don't have the resolution to see that at a window length of 2048, since the increment between frequencies is more than 21.5hz (as shown in the code below). But since the harmonics are integer multiples of the fundamental, the vibrato swing scales up with the harmonic number, so we do start to see the wiggling at higher frequencies. For instance, for a vibrato of +/-10hz, the 10th harmonic wiggles by +/-100hz. At a resolution of 21.5hz, this is about 10 pixels from top to bottom in the spectrogram image.
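Here's the rough arithmetic behind that claim (a sketch using the values from the discussion above):
# Rough arithmetic for the vibrato visibility claim above
sr, win = 44100, 2048
bin_hz = sr/win              # ~21.5hz between adjacent frequency bins
vibrato = 10                 # +/- 10hz of vibrato at the fundamental
harmonic = 10
swing = 2*vibrato*harmonic   # total swing at the 10th harmonic, in hz
print(swing/bin_hz)          # ~9.3 bins, or about 10 pixels top to bottom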
# Time resolution vs frequency resolution tradeoff in window size
x, sr = librosa.load("B5Vibrato.wav", sr=44100)
win = 2048
hop = 256
S = specgram(x, win, hop, sr)
times = hop*np.arange(S.shape[1])/sr
freqs = np.arange(S.shape[0])*sr/win
print("Frequency increment: {:.3f}hz".format(freqs[2]-freqs[1]))
S = np.log10(S/np.min(S))
plt.figure(figsize=(12, 6))
plt.imshow(S, cmap='magma_r', extent = (times[0], times[-1], freqs[-1], freqs[0]), aspect='auto', interpolation='none')
plt.gca().invert_yaxis()
plt.colorbar()
ipd.Audio(x, rate=sr)
# Frequency of the 10th harmonic of B5, which is 14 halfsteps above A4 = 440hz
10*440*2**(14/12)
# Time resolution vs frequency resolution tradeoff in window size
x, sr = librosa.load("B6Vibrato.wav", sr=44100)
win = 2048
hop = 256
S = specgram(x, win, hop, sr)
times = hop*np.arange(S.shape[1])/sr
freqs = np.arange(S.shape[0])*sr/win
# Print the frequency bins that fall between 800hz and 1200hz
print(freqs[(freqs > 800)*(freqs < 1200)])
S = np.log10(S/np.min(S))
plt.figure(figsize=(12, 6))
plt.imshow(S, cmap='magma_r', extent = (times[0], times[-1], freqs[-1], freqs[0]), aspect='auto', interpolation='none')
plt.gca().invert_yaxis()
plt.colorbar()
ipd.Audio(x, rate=sr)