Nonnegative Matrix Factorization

Kullback-Liebler Divergence Version

In [1]:
%load_ext autoreload
%autoreload 2
import librosa
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
from scipy import signal
from spectrogramtools import *
from collections import OrderedDict
import warnings
warnings.filterwarnings("ignore")

def make_html_audio(ys, sr, width=100):
    clips = []
    for y in ys:
        audio = ipd.Audio(y, rate=sr)
        audio_html = audio._repr_html_().replace('\n', '').strip()
        audio_html = audio_html.replace('<audio ', '<audio style="width: {}px; "'.format(width))
        clips.append(audio_html)
    return clips
In [2]:
y, sr = librosa.load("doves.wav", sr=22050)
win_length = 2048*4
hop_length = 512

S = STFT(y, win_length, hop_length)
V = np.abs(S)

Kullback-Liebler Divergence Version

We seek a $W$ and an $H$ so that we minimize

$\sum_{i, j} V[i, j] \left( \log \frac{V[i, j]}{WH[i, j]} \right) - V[i, j] + WH[i, j]$

Update Rules:

Let $V_L[i, j] = V[i, j] / (WH)[i, j]$

$ H[i, j] = H[i, j] \left( \frac{ (W^T V_L)[i, j] }{{\sum_a}W[a, j]} \right) $

$ W[i, j] = W[i, j] \left( \frac{ (V_L H^T)[i, j] }{{\sum_b}H[i, b]} \right) $

In [3]:
M = S.shape[0]
N = S.shape[1]
K = 3
W = np.random.rand(M, K)
H = np.random.rand(K, N)
n_iters = 100

## TODO: Apply multiplicative update rules n_iters times in a loop
#KL Divergence Version
for i in range(n_iters):
    VL = V/(W.dot(H))
    H *= (W.T).dot(VL)/np.sum(W, 0)[:, None]
    W *= (VL.dot(H.T))/np.sum(H, 1)[None, :]

Below we look at two things:

  1. A Griffin-Lim inversion of each column of W
  2. The result of zeroing out all rows except for one, then using this to create a softmax filter to extract a filtered version of the spectrogram corresponding to each column of W
In [4]:
ws = []
ys = []
WH = W.dot(H)
K = H.shape[0]
for i in range(K):
    Hi = np.zeros_like(H)
    Hi[i, :] = H[i, :]
    WHi = np.dot(W, Hi)
    Wi = W[:, i]
    Wi = np.reshape(Wi, (Wi.size, 1))
    Wi = Wi*np.ones((1, int(win_length/hop_length)))
    wi = griffinLimInverse(Wi, win_length, hop_length)
    ws.append(wi)
    Si = S*WHi/WH
    yi = iSTFT(Si, win_length, hop_length)
    ys.append(yi)
ys = make_html_audio(ys, sr, width=200)
ws = make_html_audio(ws, sr, width=100)
pd.set_option('display.max_colwidth', None) 
df = pd.DataFrame(OrderedDict([("Components", ws), ("Filtered", ys)]))
ipd.HTML(df.to_html(escape=False, float_format='%.2f'))
Out[4]:
Components Filtered
0
1
2

We also listen a Griffin-Lim inverse of $WH$ to see how well it approximated the original spectrogram. We shouldn't expect this to be perfect, since we're only using 3 components

In [5]:
y = griffinLimInverse(WH, win_length, hop_length)
ipd.Audio(y, rate=sr)
Out[5]:
In [ ]: