RUncategorized

Clustering in R

help(kmeans)

set.seed(1)
x=matrix(rnorm(50*2), ncol=2)
plot(x)



x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4
plot(x)







km.out=kmeans(x,2,nstart=20)
km.out


K-means clustering with 2 clusters of sizes 25, 25

Cluster means:
        [,1]        [,2]
1 0.03223135  0.06924384
2 3.16866521 -3.83459093

Clustering vector:
 [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Within cluster sum of squares by cluster:
[1] 28.53417 50.97988
 (between_SS / total_SS =  79.8 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      


km.out$cluster


 [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1




plot(x, col=(km.out$cluster +1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)
points(km.out$centers,pch=8,cex=2)





set.seed (2)
km.out=kmeans(x,3,nstart=20)
km.out


K-means clustering with 3 clusters of sizes 25, 20, 5

Cluster means:
        [,1]        [,2]
1 0.03223135  0.06924384
2 3.08290361 -4.26589906
3 3.51171162 -2.10935842

Clustering vector:
 [1] 2 2 2 2 3 3 2 2 2 2 3 2 2 2 2 2 2 3 2 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Within cluster sum of squares by cluster:
[1] 28.534174 27.901401  3.740301
 (between_SS / total_SS =  84.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      




set.seed (3)
km.out=kmeans(x,3,nstart=1)
km.out$tot.withinss


[1] 60.56297


km.out=kmeans(x,3,nstart=20)
km.out$tot.withinss


[1] 60.37249




iris2 = iris
iris2$Species = NULL
kmeans.result = kmeans(iris2,3,nstart=20)
kmeans.result


K-means clustering with 3 clusters of sizes 38, 50, 62

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1     6.850000    3.073684     5.742105    2.071053
2     5.006000    3.428000     1.462000    0.246000
3     5.901613    2.748387     4.393548    1.433871

Clustering vector:
  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [51] 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[101] 1 3 1 1 1 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 3 1 1 3

Within cluster sum of squares by cluster:
[1] 23.87947 15.15100 39.82097
 (between_SS / total_SS =  88.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      


table(iris$Species, kmeans.result$cluster)


            
              1  2  3
  setosa      0 50  0
  versicolor  2  0 48
  virginica  36  0 14




plot(iris2[c("Sepal.Length", "Sepal.Width")], col = kmeans.result$cluster)
points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3,pch = 8, cex=2)







library(cluster)
help(pam)
 pam.result = pam(iris2,3)
 pam.result


Medoids:
      ID Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,]   8          5.0         3.4          1.5         0.2
[2,]  79          6.0         2.9          4.5         1.5
[3,] 113          6.8         3.0          5.5         2.1
Clustering vector:
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [51] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[101] 3 2 3 3 3 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3 3 2
Objective function:
    build      swap 
0.6709391 0.6542077 

Available components:
 [1] "medoids"    "id.med"     "clustering" "objective"  "isolation"  "clusinfo"   "silinfo"   
 [8] "diss"       "call"       "data"      


 table(pam.result$clustering,iris$Species)


   
    setosa versicolor virginica
  1     50          0         0
  2      0         48        14
  3      0          2        36


 plot(pam.result)






 library(RWeka)
 help(SimpleKMeans)