#####################################################
#                                                   #
#          Using the K-means algorithm              #
#          to cluster iris species                  #
#                                                   #
#          Will be using the iris dataset           #
#                                                   #
#####################################################

# We want the iris dataset - it's available in R
data(iris)

# Initialize the random number generator. We need this because we'll be using the nstart
# parameter in the kmeans() function, which randomly initializes the process.
set.seed(20)

# Now the idea is to cluster the iris dataset - which contains variables for sepal/petal length and width
# as well as the species - based on the following attributes: Petal.Length and Petal.Width.
# We set the number of clusters K=3 since there are 3 species of iris: setosa, versicolor and virginica.
# The idea is to see whether flowers of the same species end up in the same cluster under the K-means algorithm,
# so that we can check how good our clustering is.

# Before executing the K-means algorithm, we have to normalize the variables that will be used for
# clustering, since K-means uses Euclidean distance, which is sensitive to large values. We use min-max normalization.

# Define our min-max normalization function
norm <- function(x){
  return( (x-min(x)) / (max(x)-min(x)) )
}

# Apply min-max normalization to the clustering attributes
iris$Petal.Length <- norm(iris$Petal.Length)
iris$Petal.Width <- norm(iris$Petal.Width)
# Petal.Length and Petal.Width are now normalized.

# Now do the clustering based on Petal.Length and Petal.Width.
# We use K=3 (centers parameter) since we have 3 species and want to see how K-means performs.
# Parameters:
# iris[, 3:4]: the data given as input to K-means for clustering (3 -> Petal.Length, 4 -> Petal.Width)
# centers: number of clusters to build (here 3)
# iter.max: maximum number of iterations K-means will make
# nstart: number of random starts that will be tried; the best result is kept.
irisCluster <- kmeans(iris[, 3:4], centers=3, nstart=20, iter.max=20)

# A look at the results.

# cluster: vector indicating the cluster each input row belongs to.
# E.g. the first value is the cluster of the first row of the input data iris[, 3:4].
# Clusters are numbered 1, 2, 3, etc.
irisCluster$cluster

# withinss: vector of the within-cluster sums of squares (SSE).
# The three values shown are the SSE values for each of the three clusters.
irisCluster$withinss

# totss: the sum of squared distances of all data points from the GLOBAL MEAN!
# Global mean: a single point in the data space, the mean point of all the data.
irisCluster$totss

# betweenss: the sum of squared distances of each cluster's mean from the GLOBAL MEAN,
# with each squared distance weighted by the cluster's size.
# Global mean: see above.
irisCluster$betweenss

# Now check the ratio irisCluster$betweenss / irisCluster$totss.
# Why? It gives an indication of whether there are three well-separated clusters or not. If the means of the
# three clusters are close to the GLOBAL MEAN, the clusters are not well separated and betweenss is a small
# fraction of totss. If the cluster means are far from the GLOBAL MEAN, i.e. the clusters are well separated,
# then betweenss is a large fraction of totss. Let's take a look at it...
irisCluster$betweenss/irisCluster$totss
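
# Sanity check (a minimal sketch, not part of the original analysis): for kmeans objects,
# totss decomposes as totss = tot.withinss + betweenss, so the ratio above can also be read
# as the fraction of the total variance that the clustering explains.
irisCluster$totss - (irisCluster$tot.withinss + irisCluster$betweenss)  # ~0 up to floating point
1 - irisCluster$tot.withinss / irisCluster$totss                        # same as betweenss/totss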
# Now we plot the input data, coloring each point according to the cluster it belongs to.
# Points in the same cluster will get the same color.
# How does it work? Parameter col is a vector with one color value for each row of iris[, 3:4].
# Color values are simply integers, e.g. 2, 3, 13, etc. What we do here is use the cluster number as the color
# value. However, we add +1 (see col=(irisCluster$cluster+1)) in order to get nicer colors. Note that you could
# also do col=(irisCluster$cluster) or col=(irisCluster$cluster+5) etc. This just changes the colors used for
# the points belonging to the same cluster.
# NOTE: parameter pch determines the plotting symbol of the points, cex determines their size.
plot(iris[, 3:4], col=(irisCluster$cluster+1), main="K-means result with 3 clusters", pch=20, cex=2)

# You can also display the confusion matrix here!
# The next command tells us in which cluster each species ended up.
# NOTE: in the optimal solution, data of the same species would fall into the SAME CLUSTER!
table(irisCluster$cluster, iris$Species)

###########################################################################################
#                                                                                         #
#   Now let's try something different: let's see how well we could cluster the same data  #
#   for different values of K.                                                            #
#                                                                                         #
###########################################################################################

data(iris)
set.seed(40)

# Initialize a vector wr where we will store our metric.
# In this example we use the mean (i.e. average) of withinss. Since we will execute K-means 19 times, we
# use a vector of length 20. The idea is that position i of the vector stores the mean withinss of K-means
# with i centers.
# NOTE: we ignore wr[1] since we won't execute K-means with 1 center.
wr <- rep(0, 20)

# Try clustering for K=2,3,4,...,20, executing K-means each time. Note we start with 2 clusters
# (starting with K=1 does not really make sense).
for(i in 2:20) {
  # Cluster the data with i centers
  irisCluster <- kmeans(iris[, 3:4], centers=i, nstart=20, iter.max=20)

  # Clustering done. Now store our selected metric.
  # Here we choose the mean (avg) of withinss. It could also be the ratio
  # irisCluster$betweenss/irisCluster$totss (see the sketch at the end of the script).
  wr[i] <- mean(irisCluster$withinss)
}

# Now plot the mean withinss values. Try to see where the elbow is! (elbow method)
plot(2:20, wr[2:20], type="b", xlab="Number of Clusters", ylab="Mean withinss",
     main="Assessing the Optimal Number of Clusters with the Elbow Method", pch=20, cex=2)
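
# A minimal sketch (an addition, not part of the original exercise): the same elbow scan using the
# betweenss/totss ratio mentioned in the loop above as the metric instead of mean withinss.
# The ratio grows with K and flattens out once extra clusters stop explaining much more variance.
# The names ratio and cl are illustrative.
ratio <- rep(0, 20)
for(i in 2:20) {
  cl <- kmeans(iris[, 3:4], centers=i, nstart=20, iter.max=20)
  ratio[i] <- cl$betweenss / cl$totss
}
plot(2:20, ratio[2:20], type="b", xlab="Number of Clusters", ylab="Ratio betweenss / totss",
     main="Elbow Scan with the betweenss/totss Ratio", pch=20, cex=2)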