Source code for ztlearn.ml.clustering.kmeans

# -*- coding: utf-8 -*-

import numpy as np

# implementation based on techniques as seen here: https://github.com/goldsborough/k-means/blob/master/python/k_means.py
[docs]class KMeans:

    def __init__(self, n_clusters = 2, max_iter = 300, random_state = None):
        self.n_clusters   = n_clusters
        self.max_iter     = max_iter
        self.random_state = random_state

    def _initialize_centroids(self, inputs):
        self.n_samples, self.n_features = np.shape(inputs)

        if self.random_state is not None:
            np.random.seed(self.random_state)

        # get random indices for centroid and use them to initialize centroids
        centroid_indices = np.random.choice(range(self.n_samples), self.n_clusters)
        self.centroids   = inputs[centroid_indices].T

        # stack inputs to form a tensor of dim [self.n_samples, self.n_features, self.n_clusters]
        self.stacked_inputs = np.stack([inputs] * self.n_clusters, axis = -1)

        self.all_rows    = np.arange(self.n_samples) # get all row indices in an np array
        self.sparse_data = np.zeros([self.n_samples, self.n_clusters, self.n_features])

[docs]    def fit(self, inputs):

        self._initialize_centroids(inputs) # initialize the centroids randomly

        for i in range(self.max_iter):

            # calculate distances of data_points from centroids
            # returns a distance tensor of dim [no of data_points, distance_from_each_centroid]
            distances = np.linalg.norm(self.stacked_inputs - self.centroids, axis = 1)

            # given n centroids the 'distance_from_each_centroid' metric consists of n diffrent distances to the n centroids
            # to find the minimum distance amongst the n diffrent distance we use the np.argmin function on this axis.
            # closest_centroid = np.argmin(distances, axis = -1). closest_centroid is a tensor of dim [no of data_points]
            # closest_centroid tensor consists of a collection of the indices of closest centroids.

            # for each data_point, the [row number, closest_centroid] index is its position in the sparse_data tensor
            # this operation fills in the sparse_data tensor positions at [all_rows, closest_centroid] with data from the inputs
            self.sparse_data[self.all_rows, np.argmin(distances, axis = -1)] = inputs

            # save current centroids for model convergence check
            prior_centroids = self.centroids

            # calculate the mean of all the newly formed clusters
            # get the sum of elements on the first axis (i.e axis = 0)
            # divide by the count of non zero elements in the sparse_data tensor on the first axis (i.e axis = 0)
            # also clip at a minimum of 1 to avoid division by zero
            self.centroids = np.divide(np.sum(self.sparse_data, axis = 0),
                                       np.clip(np.count_nonzero(self.sparse_data, axis = 0), a_min = 1, a_max = None)).T

            # determine if current centroids are diffrent with prior centroids. break if not
            if not np.any(self.centroids - prior_centroids):
                break

        return self.centroids.T