118 KiB
118 KiB
In [1]:
import numpy as np import matplotlib.pyplot as plt
In [2]:
def scatter_plot(data, col=None): from mpl_toolkits.mplot3d import Axes3D %matplotlib inline fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(data[:,0], data[:,1], data[:,2], s = 0.5, color=col) plt.show()
In [3]:
N = 1000 K = 3 d = 3 L = 10
In [4]:
# Generate some data np.random.seed(42) mu_true = np.random.uniform(-L, L, size = (K, d)) data = np.random.normal(mu_true, size = (N, K, d)) data = np.vstack(data) np.random.shuffle(data)
In [5]:
if d == 3: scatter_plot(data, None)
In [6]:
mu = data[np.random.choice(range(data.shape[0]), K, replace=False)] c = np.random.randint(low=0, high=K-1, size=data.shape[0])
In [7]:
def dist_i(x, mu): # x: N datapoints, mu: N cluster centers # returns: D_{i}, squared distances from x[i] to mu[i] dist = np.zeros(x.shape[0]) for i in range(x.shape[0]): dist[i] = np.sum((x[i] - mu[i])**2) return dist def dist_ij(x, mu): # x: N datapoints, mu: K cluster centers # returns: D_{ij}, squared distances from x[i] to mu[j] dist = np.zeros((x.shape[0], mu.shape[0])) for i in range(x.shape[0]): for j in range(mu.shape[0]): dist[i, j] += np.sum((x[i] - mu[j])**2) return dist
In [8]:
ss_list = [] for n in range(10): c = np.argmin(dist_ij(data, mu), axis = 1) ss = np.mean(dist_i(data, mu[c])) ss_list.append(ss) for i in range(K): cluster_members = data[c == i] cluster_members = cluster_members.mean(axis = 0) mu[i] = cluster_members
In [9]:
plt.plot(ss_list)
Out[9]:
[<matplotlib.lines.Line2D at 0x1fa6bf68ba8>]
In [10]:
colors = np.array([plt.cm.cool(i/(K-1)) for i in range(K)])
In [11]:
if d == 3: scatter_plot(data, colors[c])