#------------------------------------------------------------------------------
# kmeans fails badly when the clusters are of different sizes
#------------------------------------------------------------------------------
library(stats) # rnorm, kmeans

# Generate points from two normal distributions:
#   large Gaussian: center = (clx, cly), sd = lsd
#   small Gaussian: center = (csx, csy), sd = ssd
clx <- 10
cly <- 8
lsd <- 2
csx <- 4
csy <- 2
ssd <- 1

lx <- rnorm(1000, mean = clx, sd = lsd)
ly <- rnorm(1000, mean = cly, sd = lsd)
# Create a 1000 by 2 matrix by "binding" these vectors - this is the large
# cluster.
L <- cbind(lx, ly)

sx <- rnorm(20, mean = csx, sd = ssd)
sy <- rnorm(20, mean = csy, sd = ssd)
# Create another matrix (20 by 2) for the smaller cluster.
S <- cbind(sx, sy)

# Plot both clusters on the same graph, with two different colors.
# The empty canvas is sized from the data: the large cluster (mean 10, sd 2)
# extends past 12, so a fixed 0..12 canvas would clip part of it.
plot(range(c(sx, lx)), range(c(sy, ly)), type = "n", xlab = "x", ylab = "y")
points(sx, sy, col = "red")
points(lx, ly, col = "blue")
# Pause here and see the plot.

# Combine the two clusters into one matrix (small cluster rows first).
# Deliberately not named `T`: that would mask the `T` shorthand for TRUE.
all_points <- rbind(S, L)

# Cluster using kmeans with k = 2 and color each point by assigned cluster.
twomeans <- kmeans(all_points, centers = 2)
plot(all_points, col = twomeans$cluster)
# Add the fitted cluster centers.
points(twomeans$centers, col = "green", pch = 8)

#------------------------------------------------------------------------------
# kmeans fails badly when the clusters are non-spherical
#------------------------------------------------------------------------------
library(stats) # rnorm, kmeans

# Create cluster 1: narrow in x (sd 0.5), elongated in y (sd 2.5).
clx <- 10
cly <- 8
lx <- rnorm(1000, mean = clx, sd = 0.5)
ly <- rnorm(1000, mean = cly, sd = 2.5)
L <- cbind(lx, ly)

# Create cluster 2: same shape, shifted left to x = 6.
csx <- 6
csy <- 8
sx <- rnorm(1000, mean = csx, sd = 0.5)
sy <- rnorm(1000, mean = csy, sd = 2.5)
S <- cbind(sx, sy)

# Plot both clusters on the same graph with two colors; the canvas is again
# sized from the data because the y spread (sd 2.5 around 8) can exceed 12.
plot(range(c(sx, lx)), range(c(sy, ly)), type = "n", xlab = "x", ylab = "y")
points(sx, sy, col = "red")
points(lx, ly, col = "blue")

# Combine the data sets.
all_points <- rbind(S, L)

# Run kmeans with k = 2 and color each point by assigned cluster.
twomeans <- kmeans(all_points, centers = 2)
plot(all_points, col = twomeans$cluster)
# Add the fitted cluster centers.
points(twomeans$centers, col = "green", pch = 8)