numerics-2022/Task 1/kmeans.ipynb
2022-09-24 11:25:54 +03:00

118 KiB

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
In [2]:
def scatter_plot(data, col=None):
    from mpl_toolkits.mplot3d import Axes3D 
    %matplotlib inline
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(data[:,0], data[:,1], data[:,2], s = 0.5, color=col)
    plt.show()
In [3]:
N = 1000
K = 3
d = 3
L = 10
In [4]:
# Generate some data
np.random.seed(42)
mu_true = np.random.uniform(-L, L, size = (K, d))
data = np.random.normal(mu_true, size = (N, K, d))
data = np.vstack(data)
np.random.shuffle(data)
In [5]:
if d == 3:
    scatter_plot(data, None)
In [6]:
mu = data[np.random.choice(range(data.shape[0]), K, replace=False)]
c = np.random.randint(low=0, high=K-1, size=data.shape[0])
In [7]:
def dist_i(x, mu):
    # x: N datapoints, mu: N cluster centers
    # returns: D_{i}, squared distances from x[i] to mu[i]
    dist = np.zeros(x.shape[0])
    for i in range(x.shape[0]):
        dist[i] = np.sum((x[i] - mu[i])**2)
    return dist
def dist_ij(x, mu):
    # x: N datapoints, mu: K cluster centers
    # returns: D_{ij}, squared distances from x[i] to mu[j]
    dist = np.zeros((x.shape[0], mu.shape[0]))
    for i in range(x.shape[0]):
        for j in range(mu.shape[0]):
            dist[i, j] += np.sum((x[i] - mu[j])**2)
    return dist
In [8]:
ss_list = []
for n in range(10):
    c =  np.argmin(dist_ij(data, mu), axis = 1)    
    ss = np.mean(dist_i(data, mu[c]))
    ss_list.append(ss)    
    for i in range(K):
        cluster_members = data[c == i]
        cluster_members = cluster_members.mean(axis = 0)
        mu[i] = cluster_members
In [9]:
plt.plot(ss_list)
Out[9]:
[<matplotlib.lines.Line2D at 0x1fa6bf68ba8>]
In [10]:
colors = np.array([plt.cm.cool(i/(K-1)) for i in range(K)])
In [11]:
if d == 3:
    scatter_plot(data, colors[c])