thatanalyst.net Home K-Means Clustering Visualization
Post
Cancel

K-Means Clustering Visualization

Language :

INTRODUCTION

I designed a k-means clustering algorithm from scratch and deployed it as an interactive widget that demonstrates how the algorithm works on various datasets.

Clustering Viz.



Code

Step 1: Importing the libraries

1
2
3
4
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random


Step 2: Importing the datasets

1
2
3
# Load the pickled collection of data-sets. The context manager closes the
# file even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading; only open
# 'pointList.pkl' from a trusted source.
with open('pointList.pkl', 'rb') as file:
    points = pickle.load(file)


1
2
3
# For the visualisation, fix the number of clusters (k) and pick one
# (x, y, color) value-set out of the loaded collection.
(x, y, color), k = points[5], 4


1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Scatter-plots the selected coordinates on a square canvas without ticks.
def displayFig(x, y, color):
    """Draw the points (x, y) as a scatter plot using ``color``.

    The figure is sized to 7x7 inches, both axes share the same scale and
    the tick marks are removed. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    canvas = plt.figure()
    canvas.set_size_inches(7, 7, forward=True)
    axes = canvas.add_subplot()
    axes.scatter(x, y, color=color)
    axes.set_aspect('equal', adjustable='box')
    # Empty tick lists hide the tick marks and their labels.
    axes.set_xticks([])
    axes.set_yticks([])
    plt.show()


# Demonstration: plot the chosen value-set as-is.
displayFig(x=x, y=y, color=color)

1
2
3
4
5
6
7
# numPoints is the number of points in the selected data-set.
numPoints = len(x)
# e.g. numPoints == 500

# Select k distinct random points from the set as the initial centroids.
# random.sample draws without replacement, so no index is picked twice; a
# repeated index would give two identical centroids and leave one of them
# with an empty cluster after the first assignment.
# Raises ValueError if k is larger than numPoints.
randPointIndex = random.sample(range(numPoints), k)
# e.g. randPointIndex == [465, 234, 320, 294]


1
2
3
4
5
6
7
8
9
# Look up the coordinates of the randomly chosen starting points and key
# them by centroid number 0..k-1.
centroidsx = {c: x[randPointIndex[c]] for c in range(k)}
centroidsy = {c: y[randPointIndex[c]] for c in range(k)}
# Example coordinates of such random centroids:
# x-coord : {0: 0.37141507112239547, 1: 1.7266317736575485, 2: -0.7916827203853509, 3: 1.1613071821853798}
# y-coord : {0: -0.5835011638187061, 1: 0.6283320280090743, 2: -1.9427018365096023, 3: -1.1731795814722823}


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def displayFigCent(x, y, color, centx, centy):
    """Plot the data points together with the centroids.

    Args:
        x, y: coordinates of the data points.
        color: colour(s) passed to the scatter plot of the data points.
        centx, centy: iterables with the centroid coordinates (for example
            ``dict.values()`` views); both must have the same length.

    The centroids are drawn as black-edged stars. Their colours cycle
    through 'r', 'b', 'g', 'y', so any number of centroids can be shown
    rather than at most four. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    # Materialise the centroid coordinates so that views and one-shot
    # iterables can be counted and plotted.
    centx = list(centx)
    centy = list(centy)

    # One colour per centroid; wrap around when there are more centroids
    # than base colours.
    palette = ['r', 'b', 'g', 'y']
    centColors = [palette[i % len(palette)] for i in range(len(centx))]

    fig = plt.figure()
    fig.set_size_inches(7, 7, forward=True)
    ax = fig.add_subplot()
    ax.scatter(x, y, color=color)
    ax.scatter(centx, centy,
               color=centColors,
               marker="*",
               edgecolor="black",
               s=150)
    ax.set_aspect('equal', adjustable='box')
    ax.set_xticks([])
    ax.set_yticks([])
    plt.show()

# Demonstration: the same value-set with the initial centroids on top.
displayFigCent(x, y, color,
               centx=centroidsx.values(),
               centy=centroidsy.values())


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Pair the x and y coordinates up as (x, y) rows of a NumPy array.

# Centroid coordinates, one row per centroid:
centCoord = np.asarray(list(zip(centroidsx.values(), centroidsy.values())))

# All data-point coordinates, one row per point (500 in the example run):
allCoord = np.asarray(list(zip(x, y)))

# Example centroids:
# array([[-0.99272628, -1.25591048],
#        [ 0.11123914,  0.6001856 ],
#        [-1.43752439,  1.42929442],
#        [ 1.00916879,  0.57432898]])

# Example of the first 5 coordinates:
# array([[ 0.89473904,  0.94241653],
#        [ 0.53964825,  1.14586332],
#        [ 0.3436478 ,  0.48197713],
#        [ 1.04810771, -1.05349477],
#        [-1.08404665, -1.02482478]])


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def displayGroups(groupedCoord, centCoord):
  """Plot every cluster in its own colour together with the centroids.

  Args:
    groupedCoord: mapping from cluster number (0, 1, ...) to the sequence
      of (x, y) points assigned to that cluster. A cluster may be empty.
    centCoord: sequence of (x, y) centroid coordinates, one per cluster.

  Clusters without points are skipped instead of raising on unpacking.
  Both palettes wrap around, so more than four clusters can be drawn.
  The figure is shown with ``plt.show()``; nothing is returned.
  """
  pointPalette = ['indianred', 'royalblue', 'springgreen', 'khaki']
  centPalette = ['r', 'b', 'g', 'y']
  numGroups = len(centCoord)

  figure = plt.figure()
  figure.set_size_inches(7, 7, forward=True)
  ax = figure.add_subplot()

  for i in range(numGroups):
    members = groupedCoord[i]
    # zip(*members) yields nothing for an empty cluster, so there would
    # be no x/y pair to unpack; an empty cluster has nothing to draw.
    if len(members) == 0:
      continue
    xs, ys = zip(*members)
    ax.scatter(xs, ys, color=pointPalette[i % len(pointPalette)])

  if numGroups:
    centXs, centYs = zip(*centCoord)
    ax.scatter(centXs, centYs,
               color=[centPalette[i % len(centPalette)]
                      for i in range(numGroups)],
               marker="*",
               edgecolor="black",
               s=150)

  ax.set_aspect('equal', adjustable='box')
  ax.set_xticks([])
  ax.set_yticks([])
  plt.show()


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def cluster(num_iterations):
  """Run k-means on the module-level data and plot the resulting clusters.

  Reads the globals ``k`` and ``allCoord``, updates the global array
  ``centCoord`` in place, and finally calls ``displayGroups`` with the
  last grouping.

  Args:
    num_iterations: maximum number of assign/update rounds; must be >= 1.
      The loop stops early once a round leaves every centroid unchanged,
      because further rounds would reproduce the same grouping.

  Raises:
    ValueError: if ``num_iterations`` is smaller than 1 (there would be
      no grouping to display).
  """
  if num_iterations < 1:
    raise ValueError("num_iterations must be at least 1")

  for _ in range(num_iterations):
    previousCent = centCoord.copy()

    # Assignment step: put every point into the group of its nearest
    # centroid.
    groupedCoord = {j: [] for j in range(k)}
    for point in allCoord:
      distances = [np.linalg.norm(point - centCoord[j]) for j in range(k)]
      # argmin returns the first minimum, so ties go to the lowest
      # centroid number.
      groupedCoord[int(np.argmin(distances))].append(point)

    # Update step: move each centroid to the mean of its group. A centroid
    # whose group is empty keeps its previous position.
    for j in range(k):
      groupedCoord[j] = np.asarray(groupedCoord[j])
      if len(groupedCoord[j]) != 0:
        centCoord[j] = groupedCoord[j].mean(axis=0)

    # Converged: identical centroids give identical assignments next round.
    if np.array_equal(centCoord, previousCent):
      break

  displayGroups(groupedCoord, centCoord)


1
# Run the clustering for 100 rounds and show the final grouping.
cluster(num_iterations=100)

Please give the webapp a few seconds to run on Heroku servers. Thank you for your patience.

This post is licensed under CC BY 4.0 by the author.