–//– Chapter 08 - Cluster Analysis –//–

[=> Case Study] | 8.3 Case Study

(-) Create example dataset.

# Case-study data: one row per chocolate flavor (eleven in total), with the
# flavor name in the first column followed by ten numeric rating columns.
mydatc8_case <- data.frame(
  Flavor = c(
    "Milk", "Espresso", "Biscuit", "Orange", "Strawberry", "Mango",
    "Cappuccino", "Mousse", "Caramel", "Nougat", "Nut"
  ),
  Price = c(
    4.5, 5.1667, 5.0588, 3.8, 3.4444, 3.5,
    5.25, 5.8571, 5.0833, 5.2727, 4.5
  ),
  Refreshing = c(
    4, 4.25, 3.8235, 5.4, 5.0556, 3.5,
    3.4167, 4.4286, 4.0833, 3.6, 4
  ),
  Delicious = c(
    4.375, 3.8333, 4.7647, 3.8, 3.7778, 3.875,
    4.5833, 4.9286, 4.6667, 3.9091, 4.2
  ),
  Healthy = c(
    3.875, 3.8333, 3.4375, 2.4, 3.7647, 4,
    3.9167, 3.8571, 4, 4.0909, 3.9
  ),
  Bitter = c(
    3.25, 2.1667, 4.2353, 5, 3.9444, 4.625,
    4.3333, 4.0714, 4, 4.0909, 3.7
  ),
  Light = c(
    3.75, 3.75, 4.4706, 5, 5.3889, 5.25,
    4.4167, 5.0714, 4.25, 4.0909, 3.9
  ),
  Crunchy = c(
    4, 3.2727, 3.7647, 5, 5.0556, 5.5,
    4.6667, 2.9286, 3.8182, 4.5455, 3.6
  ),
  Exotic = c(
    2.375, 2.3333, 2.7059, 4.4, 4.9444, 6,
    3.6667, 2.0909, 1.5455, 1.7273, 2.2
  ),
  Sweet = c(
    4.625, 3.75, 3.5294, 4, 4.2222, 4.75,
    4.5, 4.5714, 3.75, 3.9091, 3.5
  ),
  Fruity = c(
    4.125, 3.4167, 3.5294, 4.6, 5.2778, 5.375,
    3.5833, 3.7857, 4.1667, 3.8182, 3.7
  )
)

# Show the full table so the values can be checked against the book.
print(mydatc8_case)

##        Flavor  Price Refreshing Delicious Healthy Bitter  Light Crunchy Exotic
## 1        Milk 4.5000     4.0000    4.3750  3.8750 3.2500 3.7500  4.0000 2.3750
## 2    Espresso 5.1667     4.2500    3.8333  3.8333 2.1667 3.7500  3.2727 2.3333
## 3     Biscuit 5.0588     3.8235    4.7647  3.4375 4.2353 4.4706  3.7647 2.7059
## 4      Orange 3.8000     5.4000    3.8000  2.4000 5.0000 5.0000  5.0000 4.4000
## 5  Strawberry 3.4444     5.0556    3.7778  3.7647 3.9444 5.3889  5.0556 4.9444
## 6       Mango 3.5000     3.5000    3.8750  4.0000 4.6250 5.2500  5.5000 6.0000
## 7  Cappuccino 5.2500     3.4167    4.5833  3.9167 4.3333 4.4167  4.6667 3.6667
## 8      Mousse 5.8571     4.4286    4.9286  3.8571 4.0714 5.0714  2.9286 2.0909
## 9     Caramel 5.0833     4.0833    4.6667  4.0000 4.0000 4.2500  3.8182 1.5455
## 10     Nougat 5.2727     3.6000    3.9091  4.0909 4.0909 4.0909  4.5455 1.7273
## 11        Nut 4.5000     4.0000    4.2000  3.9000 3.7000 3.9000  3.6000 2.2000
##     Sweet Fruity
## 1  4.6250 4.1250
## 2  3.7500 3.4167
## 3  3.5294 3.5294
## 4  4.0000 4.6000
## 5  4.2222 5.2778
## 6  4.7500 5.3750
## 7  4.5000 3.5833
## 8  4.5714 3.7857
## 9  3.7500 4.1667
## 10 3.9091 3.8182
## 11 3.5000 3.7000

(-) Proximity measure # Squared Euclidean distances

(-) Figure 8.21 - Squared Euclidean distance matrix of the eleven chocolate types

# Euclidean distances between flavors over every column except the Flavor
# label itself.
distdata <- dplyr::select(mydatc8_case, -Flavor) %>%
  dist(method = "euclidean")

# Squaring element-wise turns them into squared Euclidean distances.
distdata <- distdata^2

print(distdata)

##            1         2         3         4         5         6         7
## 2   3.773724                                                            
## 3   3.896702  6.458339                                                  
## 4  15.298125 23.553429 12.705602                                        
## 5  14.972885 23.152547 16.347466  4.181745                              
## 6  22.625000 34.903649 23.160277 10.441875  4.570110                    
## 7   4.987936 10.720875  3.158802 11.667141 12.446798 13.768911          
## 8   6.099616  8.070800  3.822922 20.051776 22.678958 32.338016  7.562721
## 9   2.748877  5.848777  2.302343 18.049297 20.318250 29.995502  6.747214
## 10  3.167791  6.498369  3.214596 17.902186 20.129763 27.088441  4.864211
## 11  1.893125  3.287069  1.801362 15.620000 17.363225 26.461875  6.020661
##            8         9        10
## 2                               
## 3                               
## 4                               
## 5                               
## 6                               
## 7                               
## 8                               
## 9   3.395515                    
## 10  6.270173  1.594029          
## 11  5.686156  1.543777  2.366126

(-) Hierarchical clustering # Single Linkage Algorithm

(-) Figure 8.20 - Dendrogram for the single-linkage method

# Single-linkage clustering on the squared distances; the first dendrogram
# labels the leaves with row numbers.
hc_sl <- hclust(distdata, method = "single")
plot(hc_sl, ylab = "Scaled Height", cex.main = 1, cex.lab = 1, cex.axis = 1)

# Attach the flavor names as leaf labels and draw the dendrogram again.
hc_sl[["labels"]] <- mydatc8_case[["Flavor"]]

plot(hc_sl, ylab = "Scaled Height", cex.main = 1, cex.lab = 1, cex.axis = 1)

(-) Figure 8.22 - Agglomeration schedule of Ward’s method for the case study

(-) Figure 8.23 - Dendrogram of cluster analysis using the Ward process

# Ward clustering on the distance object; the first dendrogram labels the
# leaves with row numbers.
# NOTE(review): distdata already holds squared distances, and hclust()
# documents that "ward.D2" squares the dissimilarities itself. "ward.D" may be
# the intended method for squared input -- confirm against the book's figures
# before changing, since all heights printed below depend on this call.
hc_w <- hclust(distdata, method = "ward.D2")
plot(hc_w, ylab = "Scaled Height", cex.main = 1, cex.lab = 1, cex.axis = 1)

# Attach the flavor names as leaf labels and draw the dendrogram again.
hc_w[["labels"]] <- mydatc8_case[["Flavor"]]

plot(hc_w, ylab = "Scaled Height", cex.main = 1, cex.lab = 1, cex.axis = 1)

(-) Determine number of clusters

(-) Figure 8.24 - Development of the heterogeneity measure in the case study

# Absolute values differ from SPSS (height = value of the heterogeneity measure used; here: variance criterion)


# Merge matrix: one row per agglomeration step (negative entries are single
# observations, positive entries refer to earlier steps).
hc_w[["merge"]]

##       [,1] [,2]
##  [1,]   -9  -11
##  [2,]  -10    1
##  [3,]   -3    2
##  [4,]   -1    3
##  [5,]   -4   -5
##  [6,]   -8    4
##  [7,]   -7    6
##  [8,]   -2    7
##  [9,]   -6    5
## [10,]    8    9

# Heights first, then the merge matrix (components 2 and 1 of the fit).
hc_w[c("height", "merge")]

## $height
##  [1]  1.543777  2.152186  2.772340  3.406401  4.181745  6.372982  6.845778
##  [8]  8.160512  8.987961 43.415330
## 
## $merge
##       [,1] [,2]
##  [1,]   -9  -11
##  [2,]  -10    1
##  [3,]   -3    2
##  [4,]   -1    3
##  [5,]   -4   -5
##  [6,]   -8    4
##  [7,]   -7    6
##  [8,]   -2    7
##  [9,]   -6    5
## [10,]    8    9

# Heterogeneity value recorded at each agglomeration step.
hc_w[["height"]]

##  [1]  1.543777  2.152186  2.772340  3.406401  4.181745  6.372982  6.845778
##  [8]  8.160512  8.987961 43.415330

# Agglomeration schedule: one row per merge step with its height and the two
# merged items, plus the step number and the number of clusters left after
# that step. Both counters are derived from the number of rows instead of a
# hard-coded 10, so the table stays correct if the number of flavors changes.
clust_steps_data <- data.frame(hc_w[2:1]) %>%
  mutate(
    nclust = rev(seq_len(n())), # clusters remaining after each merge
    step = seq_len(n())         # 1 = first merge
  )

print(clust_steps_data)

##       height merge.1 merge.2 nclust step
## 1   1.543777      -9     -11     10    1
## 2   2.152186     -10       1      9    2
## 3   2.772340      -3       2      8    3
## 4   3.406401      -1       3      7    4
## 5   4.181745      -4      -5      6    5
## 6   6.372982      -8       4      5    6
## 7   6.845778      -7       6      4    7
## 8   8.160512      -2       7      3    8
## 9   8.987961      -6       5      2    9
## 10 43.415330       8       9      1   10

# Heterogeneity (merge height) against the number of clusters, with one x-axis
# tick per cluster count.
with(
  clust_steps_data,
  plot(nclust, height,
       type = "b",
       xlab = "Number of clusters",
       ylab = "Scaled Height",
       cex.main = 1, cex.lab = 1, cex.axis = 1)
)
axis(side = 1, at = seq(1, 10, by = 1))

(-) 2 Cluster Solution

# Refit the Ward clustering (this resets the leaf labels to row numbers) and
# draw the plain dendrogram.
hc_w <- hclust(distdata, method = "ward.D2")
plot(hc_w, ylab = "Scaled Height", cex.main = 1, cex.lab = 1, cex.axis = 1)

# Use the flavor names as leaf labels.
hc_w[["labels"]] <- mydatc8_case[["Flavor"]]

# Redraw and frame the two-cluster solution (k = 2) in red.
plot(hc_w, ylab = "Scaled Height", cex.main = 1, cex.lab = 1, cex.axis = 1)
rect.hclust(hc_w, k = 2, border = "red")

(-) Cluster assignment for 2,3,4,5 cluster solution

(-) Figure 8.25 - Cluster membership for different solutions (2 to 5 clusters)

# Cut the Ward tree into 2, 3, 4 and 5 clusters; each result is a vector of
# cluster numbers, one per flavor.
groups_2_clusters <- cutree(hc_w, k = 2)
groups_3_clusters <- cutree(hc_w, k = 3)
groups_4_clusters <- cutree(hc_w, k = 4)
groups_5_clusters <- cutree(hc_w, k = 5)

# Membership for the two-cluster solution.
print("k = 2")
groups_2_clusters

## [1] "k = 2"

##       Milk   Espresso    Biscuit     Orange Strawberry      Mango Cappuccino 
##          1          1          1          2          2          2          1 
##     Mousse    Caramel     Nougat        Nut 
##          1          1          1          1

# Membership for the three-cluster solution.
print("k = 3")
groups_3_clusters

## [1] "k = 3"

##       Milk   Espresso    Biscuit     Orange Strawberry      Mango Cappuccino 
##          1          1          1          2          2          3          1 
##     Mousse    Caramel     Nougat        Nut 
##          1          1          1          1

# Membership for the four-cluster solution.
print("k = 4")
groups_4_clusters

## [1] "k = 4"

##       Milk   Espresso    Biscuit     Orange Strawberry      Mango Cappuccino 
##          1          2          1          3          3          4          1 
##     Mousse    Caramel     Nougat        Nut 
##          1          1          1          1

# Membership for the five-cluster solution.
print("k = 5")
groups_5_clusters

## [1] "k = 5"

##       Milk   Espresso    Biscuit     Orange Strawberry      Mango Cappuccino 
##          1          2          1          3          3          4          5 
##     Mousse    Caramel     Nougat        Nut 
##          1          1          1          1

(-) Optimization of the cluster solution with K-Means

(-) Figure 8.26 - Cluster membership and final cluster centers according to the k-means method

# Select K: fit k-means for k = 1, ..., 10 on the rating columns (everything
# except the Flavor label in column 1). `k` is a list whose i-th element is
# the fit with i clusters.
input <- mydatc8_case

# nstart = 25 matches the final two-cluster fit further down. With a single
# random start, kmeans() can stop in a poor local optimum, which makes the
# between_SS / total_SS ratio jump around between neighbouring k.
# NOTE: kmeans() starts from random centers and no seed is set, so cluster
# numbering (and, without nstart, the partitions) can differ from any output
# transcript printed below.
k <- lapply(1:10, function(n_clusters) {
  kmeans(input[, -1], centers = n_clusters, nstart = 25)
})

print(k)

## [[1]]
## K-means clustering with 1 clusters of sizes 11
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy Bitter    Light  Crunchy   Exotic
## 1 4.675727   4.141609  4.246682 3.734109  3.947 4.485318 4.195636 3.089909
##      Sweet   Fruity
## 1 4.100645 4.125255
## 
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 58.14504
##  (between_SS / total_SS =   0.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[2]]
## K-means clustering with 2 clusters of sizes 8, 3
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy   Bitter    Light Crunchy   Exotic
## 1 5.086075   3.950263  4.407587 3.863812 3.730950 4.212450 3.82455 2.330575
## 2 3.581467   4.651867  3.817600 3.388233 4.523133 5.212967 5.18520 5.114800
##      Sweet   Fruity
## 1 4.016863 3.765625
## 2 4.324067 5.084267
## 
## Clustering vector:
##  [1] 1 1 1 2 2 2 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 15.97533  6.39791
##  (between_SS / total_SS =  61.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[3]]
## K-means clustering with 3 clusters of sizes 3, 1, 7
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy   Bitter    Light  Crunchy   Exotic
## 1 3.581467   4.651867  3.817600 3.388233 4.523133 5.212967 5.185200 5.114800
## 2 5.166700   4.250000  3.833300 3.833300 2.166700 3.750000 3.272700 2.333300
## 3 5.074557   3.907443  4.489629 3.868171 3.954414 4.278514 3.903386 2.330186
##      Sweet   Fruity
## 1 4.324067 5.084267
## 2 3.750000 3.416700
## 3 4.054986 3.815471
## 
## Clustering vector:
##  [1] 3 2 3 1 1 1 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  6.39791  0.00000 11.87781
##  (between_SS / total_SS =  68.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[4]]
## K-means clustering with 4 clusters of sizes 1, 5, 3, 2
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy  Bitter   Light  Crunchy   Exotic
## 1 3.500000   3.500000   3.87500 4.000000 4.62500 5.25000 5.500000 6.000000
## 2 5.304380   3.870420   4.57048 3.860440 4.14618 4.45992 3.944740 2.347260
## 3 4.722233   4.083333   4.13610 3.869433 3.03890 3.80000 3.624233 2.302767
## 4 3.622200   5.227800   3.78890 3.082350 4.47220 5.19445 5.027800 4.672200
##      Sweet   Fruity
## 1 4.750000 5.375000
## 2 4.051980 3.776660
## 3 3.958333 3.747233
## 4 4.111100 4.938900
## 
## Clustering vector:
##  [1] 3 3 2 4 4 1 2 2 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 0.000000 8.586505 2.984639 2.090872
##  (between_SS / total_SS =  76.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[5]]
## K-means clustering with 5 clusters of sizes 1, 1, 1, 3, 5
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy   Bitter    Light Crunchy  Exotic
## 1 5.857100   4.428600    4.9286 3.857100 4.071400 5.071400 2.92860 2.09090
## 2 5.250000   3.416700    4.5833 3.916700 4.333300 4.416700 4.66670 3.66670
## 3 5.166700   4.250000    3.8333 3.833300 2.166700 3.750000 3.27270 2.33330
## 4 3.581467   4.651867    3.8176 3.388233 4.523133 5.212967 5.18520 5.11480
## 5 4.882960   3.901360    4.3831 3.860680 3.855240 4.092300 3.94568 2.11074
##      Sweet   Fruity
## 1 4.571400 3.785700
## 2 4.500000 3.583300
## 3 3.750000 3.416700
## 4 4.324067 5.084267
## 5 3.862700 3.867860
## 
## Clustering vector:
##  [1] 5 3 5 4 4 4 2 1 5 5 5
## 
## Within cluster sum of squares by cluster:
## [1] 0.000000 0.000000 0.000000 6.397910 4.905746
##  (between_SS / total_SS =  80.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[6]]
## K-means clustering with 6 clusters of sizes 4, 1, 1, 1, 1, 3
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy   Bitter    Light Crunchy   Exotic
## 1 4.978700   3.876700  4.385125 3.857100 4.006550 4.177875  3.9321 2.044675
## 2 5.857100   4.428600  4.928600 3.857100 4.071400 5.071400  2.9286 2.090900
## 3 5.166700   4.250000  3.833300 3.833300 2.166700 3.750000  3.2727 2.333300
## 4 4.500000   4.000000  4.375000 3.875000 3.250000 3.750000  4.0000 2.375000
## 5 5.250000   3.416700  4.583300 3.916700 4.333300 4.416700  4.6667 3.666700
## 6 3.581467   4.651867  3.817600 3.388233 4.523133 5.212967  5.1852 5.114800
##      Sweet   Fruity
## 1 3.672125 3.803575
## 2 4.571400 3.785700
## 3 3.750000 3.416700
## 4 4.625000 4.125000
## 5 4.500000 3.583300
## 6 4.324067 5.084267
## 
## Clustering vector:
##  [1] 4 3 1 6 6 6 5 2 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 3.205558 0.000000 0.000000 0.000000 0.000000 6.397910
##  (between_SS / total_SS =  83.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[7]]
## K-means clustering with 7 clusters of sizes 1, 2, 1, 1, 1, 1, 4
## 
## Cluster means:
##    Price Refreshing Delicious  Healthy   Bitter    Light  Crunchy  Exotic
## 1 5.0588   3.823500    4.7647 3.437500 4.235300 4.470600 3.764700 2.70590
## 2 3.6222   5.227800    3.7889 3.082350 4.472200 5.194450 5.027800 4.67220
## 3 5.1667   4.250000    3.8333 3.833300 2.166700 3.750000 3.272700 2.33330
## 4 5.2500   3.416700    4.5833 3.916700 4.333300 4.416700 4.666700 3.66670
## 5 3.5000   3.500000    3.8750 4.000000 4.625000 5.250000 5.500000 6.00000
## 6 5.8571   4.428600    4.9286 3.857100 4.071400 5.071400 2.928600 2.09090
## 7 4.8390   3.920825    4.2877 3.966475 3.760225 3.997725 3.990925 1.96195
##      Sweet   Fruity
## 1 3.529400 3.529400
## 2 4.111100 4.938900
## 3 3.750000 3.416700
## 4 4.500000 3.583300
## 5 4.750000 5.375000
## 6 4.571400 3.785700
## 7 3.946025 3.952475
## 
## Clustering vector:
##  [1] 7 3 1 2 2 5 4 6 7 7 7
## 
## Within cluster sum of squares by cluster:
## [1] 0.000000 2.090872 0.000000 0.000000 0.000000 0.000000 3.328431
##  (between_SS / total_SS =  90.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[8]]
## K-means clustering with 8 clusters of sizes 1, 1, 1, 1, 1, 1, 1, 4
## 
## Cluster means:
##    Price Refreshing Delicious Healthy  Bitter    Light Crunchy   Exotic
## 1 5.2500     3.4167  4.583300  3.9167 4.33330 4.416700  4.6667 3.666700
## 2 3.4444     5.0556  3.777800  3.7647 3.94440 5.388900  5.0556 4.944400
## 3 4.5000     4.0000  4.375000  3.8750 3.25000 3.750000  4.0000 2.375000
## 4 3.8000     5.4000  3.800000  2.4000 5.00000 5.000000  5.0000 4.400000
## 5 3.5000     3.5000  3.875000  4.0000 4.62500 5.250000  5.5000 6.000000
## 6 5.8571     4.4286  4.928600  3.8571 4.07140 5.071400  2.9286 2.090900
## 7 5.1667     4.2500  3.833300  3.8333 2.16670 3.750000  3.2727 2.333300
## 8 4.9787     3.8767  4.385125  3.8571 4.00655 4.177875  3.9321 2.044675
##      Sweet   Fruity
## 1 4.500000 3.583300
## 2 4.222200 5.277800
## 3 4.625000 4.125000
## 4 4.000000 4.600000
## 5 4.750000 5.375000
## 6 4.571400 3.785700
## 7 3.750000 3.416700
## 8 3.672125 3.803575
## 
## Clustering vector:
##  [1] 3 7 8 4 2 5 1 6 8 8 8
## 
## Within cluster sum of squares by cluster:
## [1] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.205558
##  (between_SS / total_SS =  94.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[9]]
## K-means clustering with 9 clusters of sizes 1, 1, 1, 1, 1, 1, 1, 3, 1
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy   Bitter    Light Crunchy Exotic
## 1 5.058800   3.823500    4.7647 3.437500 4.235300 4.470600  3.7647 2.7059
## 2 5.272700   3.600000    3.9091 4.090900 4.090900 4.090900  4.5455 1.7273
## 3 5.857100   4.428600    4.9286 3.857100 4.071400 5.071400  2.9286 2.0909
## 4 4.500000   4.000000    4.2000 3.900000 3.700000 3.900000  3.6000 2.2000
## 5 4.500000   4.000000    4.3750 3.875000 3.250000 3.750000  4.0000 2.3750
## 6 5.083300   4.083300    4.6667 4.000000 4.000000 4.250000  3.8182 1.5455
## 7 5.166700   4.250000    3.8333 3.833300 2.166700 3.750000  3.2727 2.3333
## 8 3.581467   4.651867    3.8176 3.388233 4.523133 5.212967  5.1852 5.1148
## 9 5.250000   3.416700    4.5833 3.916700 4.333300 4.416700  4.6667 3.6667
##      Sweet   Fruity
## 1 3.529400 3.529400
## 2 3.909100 3.818200
## 3 4.571400 3.785700
## 4 3.500000 3.700000
## 5 4.625000 4.125000
## 6 3.750000 4.166700
## 7 3.750000 3.416700
## 8 4.324067 5.084267
## 9 4.500000 3.583300
## 
## Clustering vector:
##  [1] 5 7 1 8 8 8 9 3 6 2 4
## 
## Within cluster sum of squares by cluster:
## [1] 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 6.39791 0.00000
##  (between_SS / total_SS =  89.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## [[10]]
## K-means clustering with 10 clusters of sizes 2, 1, 1, 1, 1, 1, 1, 1, 1, 1
## 
## Cluster means:
##      Price Refreshing Delicious Healthy Bitter  Light Crunchy  Exotic  Sweet
## 1  4.79165    4.04165   4.43335  3.9500 3.8500 4.0750  3.7091 1.87275 3.6250
## 2  5.27270    3.60000   3.90910  4.0909 4.0909 4.0909  4.5455 1.72730 3.9091
## 3  5.05880    3.82350   4.76470  3.4375 4.2353 4.4706  3.7647 2.70590 3.5294
## 4  5.16670    4.25000   3.83330  3.8333 2.1667 3.7500  3.2727 2.33330 3.7500
## 5  3.44440    5.05560   3.77780  3.7647 3.9444 5.3889  5.0556 4.94440 4.2222
## 6  3.80000    5.40000   3.80000  2.4000 5.0000 5.0000  5.0000 4.40000 4.0000
## 7  3.50000    3.50000   3.87500  4.0000 4.6250 5.2500  5.5000 6.00000 4.7500
## 8  5.25000    3.41670   4.58330  3.9167 4.3333 4.4167  4.6667 3.66670 4.5000
## 9  5.85710    4.42860   4.92860  3.8571 4.0714 5.0714  2.9286 2.09090 4.5714
## 10 4.50000    4.00000   4.37500  3.8750 3.2500 3.7500  4.0000 2.37500 4.6250
##     Fruity
## 1  3.93335
## 2  3.81820
## 3  3.52940
## 4  3.41670
## 5  5.27780
## 6  4.60000
## 7  5.37500
## 8  3.58330
## 9  3.78570
## 10 4.12500
## 
## Clustering vector:
##  [1] 10  4  3  6  5  7  8  9  1  2  1
## 
## Within cluster sum of squares by cluster:
##  [1] 0.7718885 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [8] 0.0000000 0.0000000 0.0000000
##  (between_SS / total_SS =  98.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Share of the total sum of squares explained by the clustering, one list
# element per fit for k = 1, ..., 10.
betweenss_totss <- lapply(k[1:10], function(fit) fit$betweenss / fit$totss)

# Explained share against the number of clusters, with one x-axis tick per k.
plot(1:10, betweenss_totss,
     type = "b",
     ylab = "Between SS / Total SS",
     xlab = "Clusters (k)",
     cex.main = 1, cex.lab = 1, cex.axis = 1)
axis(side = 1, at = seq(1, 10, by = 1))

# Calculation of k-means for the chosen number of clusters (here: k = 2),
# using 25 random starts and keeping the best solution.
fit_kmeans_K2 <- kmeans(input[, -1], centers = 2, nstart = 25)

# kmeans() numbers its clusters arbitrarily, so the same partition can come
# back with labels 1 and 2 swapped between runs. The later tables label
# cluster 1 as "Fruit" and cluster 2 as "Classic", so renumber the clusters by
# descending mean "Fruity" rating to make cluster 1 the fruity group.
label_order <- order(fit_kmeans_K2$centers[, "Fruity"], decreasing = TRUE)
fit_kmeans_K2$cluster <- match(fit_kmeans_K2$cluster, label_order)
fit_kmeans_K2$centers <- fit_kmeans_K2$centers[label_order, , drop = FALSE]
rownames(fit_kmeans_K2$centers) <- seq_along(label_order)
fit_kmeans_K2$withinss <- fit_kmeans_K2$withinss[label_order]
fit_kmeans_K2$size <- fit_kmeans_K2$size[label_order]

# Results: cluster membership and cluster centers according to the k-means
# method (k = 2)
summary(fit_kmeans_K2)
print(fit_kmeans_K2)
print(fit_kmeans_K2$centers)

##              Length Class  Mode   
## cluster      11     -none- numeric
## centers      20     -none- numeric
## totss         1     -none- numeric
## withinss      2     -none- numeric
## tot.withinss  1     -none- numeric
## betweenss     1     -none- numeric
## size          2     -none- numeric
## iter          1     -none- numeric
## ifault        1     -none- numeric

## K-means clustering with 2 clusters of sizes 3, 8
## 
## Cluster means:
##      Price Refreshing Delicious  Healthy   Bitter    Light Crunchy   Exotic
## 1 3.581467   4.651867  3.817600 3.388233 4.523133 5.212967 5.18520 5.114800
## 2 5.086075   3.950263  4.407587 3.863812 3.730950 4.212450 3.82455 2.330575
##      Sweet   Fruity
## 1 4.324067 5.084267
## 2 4.016863 3.765625
## 
## Clustering vector:
##  [1] 2 2 2 1 1 1 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1]  6.39791 15.97533
##  (between_SS / total_SS =  61.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

##      Price Refreshing Delicious  Healthy   Bitter    Light Crunchy   Exotic
## 1 3.581467   4.651867  3.817600 3.388233 4.523133 5.212967 5.18520 5.114800
## 2 5.086075   3.950263  4.407587 3.863812 3.730950 4.212450 3.82455 2.330575
##      Sweet   Fruity
## 1 4.324067 5.084267
## 2 4.016863 3.765625

(-) Figure 8.27 - ANOVA table of the k-means clustering method

# ANOVA

# Cluster centers transposed to one row per criterion, with the criterion name
# moved from the row names into a regular column.
kmeans_df <- rownames_to_column(
  data.frame(t(fit_kmeans_K2$centers)),
  var = "Criterion"
)
names(kmeans_df)[1:3] <- c("Criterion", "Cluster01", "Cluster02")
print(kmeans_df)

##     Criterion Cluster01 Cluster02
## 1       Price  3.581467  5.086075
## 2  Refreshing  4.651867  3.950263
## 3   Delicious  3.817600  4.407587
## 4     Healthy  3.388233  3.863812
## 5      Bitter  4.523133  3.730950
## 6       Light  5.212967  4.212450
## 7     Crunchy  5.185200  3.824550
## 8      Exotic  5.114800  2.330575
## 9       Sweet  4.324067  4.016863
## 10     Fruity  5.084267  3.765625

# Original data with the k-means cluster number of each flavor appended.
kmeans_df2 <- mutate(mydatc8_case, Cluster = fit_kmeans_K2$cluster)

# One ANOVA per criterion (shown individually for Price and Refreshing).
paste("Price")
summary(aov(kmeans_df2$Price ~ kmeans_df2$Cluster))

## [1] "Price"

##                    Df Sum Sq Mean Sq F value   Pr(>F)    
## kmeans_df2$Cluster  1  4.939   4.939   31.23 0.000339 ***
## Residuals           9  1.424   0.158                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Same single ANOVA for the Refreshing rating.
paste("Refreshing")
summary(aov(kmeans_df2$Refreshing ~ kmeans_df2$Cluster))

## [1] "Refreshing"

##                    Df Sum Sq Mean Sq F value Pr(>F)  
## kmeans_df2$Cluster  1  1.074  1.0740   3.435 0.0968 .
## Residuals           9  2.814  0.3127                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ...

# Run all single ANOVAs at once and print output at once.
# Only the rating columns are used as responses: the Flavor label and the
# Cluster column itself are excluded. Including Cluster would build the
# degenerate model `Cluster ~ Cluster`, which makes aov() warn that the
# response appeared on the right-hand side and was dropped.
formulae <- lapply(
  setdiff(colnames(kmeans_df2), c("Flavor", "Cluster")),
  function(x) as.formula(paste0(x, " ~ Cluster"))
)

cluster_aovres <- lapply(formulae, function(x) summary(aov(x, data = kmeans_df2)))

# Name each result after its model formula so the printout is self-describing.
names(cluster_aovres) <- format(formulae)
print(cluster_aovres)

## $`Price ~ Cluster`
##             Df Sum Sq Mean Sq F value   Pr(>F)
## Cluster      1  4.939   4.939   31.23 0.000339 ***
## Residuals    9  1.424   0.158
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Refreshing ~ Cluster`
##             Df Sum Sq Mean Sq F value Pr(>F)
## Cluster      1  1.074  1.0740   3.435 0.0968 .
## Residuals    9  2.814  0.3127
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Delicious ~ Cluster`
##             Df Sum Sq Mean Sq F value Pr(>F)
## Cluster      1 0.7595  0.7595   6.078 0.0358 *
## Residuals    9 1.1246  0.1250
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Healthy ~ Cluster`
##             Df Sum Sq Mean Sq F value Pr(>F)
## Cluster      1 0.4935  0.4935   2.538  0.146
## Residuals    9 1.7497  0.1944
##
## $`Bitter ~ Cluster`
##             Df Sum Sq Mean Sq F value Pr(>F)
## Cluster      1  1.369  1.3692   2.943   0.12
## Residuals    9  4.187  0.4652
##
## $`Light ~ Cluster`
##             Df Sum Sq Mean Sq F value  Pr(>F)
## Cluster      1  2.184  2.1841   13.41 0.00521 **
## Residuals    9  1.465  0.1628
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Crunchy ~ Cluster`
##             Df Sum Sq Mean Sq F value  Pr(>F)
## Cluster      1  4.039   4.039   14.14 0.00448 **
## Residuals    9  2.571   0.286
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Exotic ~ Cluster`
##             Df Sum Sq Mean Sq F value   Pr(>F)
## Cluster      1 16.913  16.913   35.35 0.000217 ***
## Residuals    9  4.306   0.478
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`Sweet ~ Cluster`
##             Df Sum Sq Mean Sq F value Pr(>F)
## Cluster      1 0.2059  0.2059   0.993  0.345
## Residuals    9 1.8664  0.2074
##
## $`Fruity ~ Cluster`
##             Df Sum Sq Mean Sq F value   Pr(>F)
## Cluster      1  3.794   3.794   39.48 0.000144 ***
## Residuals    9  0.865   0.096
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(-) Figure 8.28 - Mean values and variances of the assessments in the survey population (total) and the two clusters

# Cluster centers as a table: one row per criterion, one column per cluster.
centers_by_criterion <- t(fit_kmeans_K2$centers)
kmeans_df <- rownames_to_column(data.frame(centers_by_criterion), var = "Criterion")
names(kmeans_df)[1:3] <- c("Criterion", "Cluster01", "Cluster02")
print(kmeans_df)

##     Criterion Cluster01 Cluster02
## 1       Price  3.581467  5.086075
## 2  Refreshing  4.651867  3.950263
## 3   Delicious  3.817600  4.407587
## 4     Healthy  3.388233  3.863812
## 5      Bitter  4.523133  3.730950
## 6       Light  5.212967  4.212450
## 7     Crunchy  5.185200  3.824550
## 8      Exotic  5.114800  2.330575
## 9       Sweet  4.324067  4.016863
## 10     Fruity  5.084267  3.765625

# Original data with the k-means cluster number of each flavor appended.
kmeans_df2 <- mutate(mydatc8_case, Cluster = fit_kmeans_K2$cluster)

# Per-cluster mean (suffix fn1) and variance (suffix fn2) of columns 2 to 11,
# i.e. the ten rating columns.
summarise_at(
  group_by(kmeans_df2, Cluster),
  .vars = vars(2:11),
  .funs = c(mean, var)
)

## # A tibble: 2 x 21
##   Cluster Price_fn1 Refreshing_fn1 Delicious_fn1 Healthy_fn1 Bitter_fn1
##     <int>     <dbl>          <dbl>         <dbl>       <dbl>      <dbl>
## 1       1      3.58           4.65          3.82        3.39       4.52
## 2       2      5.09           3.95          4.41        3.86       3.73
## # ... with 15 more variables: Light_fn1 <dbl>, Crunchy_fn1 <dbl>,
## #   Exotic_fn1 <dbl>, Sweet_fn1 <dbl>, Fruity_fn1 <dbl>, Price_fn2 <dbl>,
## #   Refreshing_fn2 <dbl>, Delicious_fn2 <dbl>, Healthy_fn2 <dbl>,
## #   Bitter_fn2 <dbl>, Light_fn2 <dbl>, Crunchy_fn2 <dbl>, Exotic_fn2 <dbl>,
## #   Sweet_fn2 <dbl>, Fruity_fn2 <dbl>

# Overall (ungrouped) mean (suffix fn1) and variance (suffix fn2) of the ten
# rating columns, computed on the data frame that carries the cluster column.
summarise_at(kmeans_df2, .vars = vars(2:11), .funs = c(mean, var))

##   Price_fn1 Refreshing_fn1 Delicious_fn1 Healthy_fn1 Bitter_fn1 Light_fn1
## 1  4.675727       4.141609      4.246682    3.734109      3.947  4.485318
##   Crunchy_fn1 Exotic_fn1 Sweet_fn1 Fruity_fn1 Price_fn2 Refreshing_fn2
## 1    4.195636   3.089909  4.100645   4.125255 0.6362881      0.3888235
##   Delicious_fn2 Healthy_fn2 Bitter_fn2 Light_fn2 Crunchy_fn2 Exotic_fn2
## 1     0.1884095   0.2243131  0.5556122 0.3649436   0.6610622   2.121967
##   Sweet_fn2 Fruity_fn2
## 1 0.2072298  0.4658554

# Number of flavors in each cluster.
tally(group_by(kmeans_df2, Cluster))

## # A tibble: 2 x 2
##   Cluster     n
##     <int> <int>
## 1       1     3
## 2       2     8

# Total number of flavors.
tally(kmeans_df2)

##    n
## 1 11

(-) Description of the two cluster solution with t- and F-values

(-) Table 8.21 - t- and F-values of the two-cluster solution in the case study

# Inputs for the t- and F-values: one row per cluster, holding the cluster
# means of the ten rating columns (columns 2-11, suffix fn1) followed by their
# cluster variances (columns 12-21, suffix fn2).
cluster_stats <- summarise_at(
  group_by(kmeans_df2, Cluster),
  .vars = vars(2:11),
  .funs = c(mean, var)
)


### Cluster 1 (= Fruit)
# For every criterion, two values are bound side by side, taken from row 1 of
# cluster_stats:
#   t-value = (cluster mean - overall mean) / overall standard deviation
#   F-value = cluster variance / overall variance
# Cluster means sit in columns 2-11 of cluster_stats and cluster variances in
# columns 12-21, in the same criterion order.
C1_F_t_values <- cbind(
  cluster = "Cluster = Fruit, in the following fn1 = t-value, fn2 = F-value",

  # Price: t-value, then F-value
  (cluster_stats[1, 2] - mean(kmeans_df2[["Price"]])) / sd(kmeans_df2[["Price"]]),
  cluster_stats[1, 12] / var(kmeans_df2[["Price"]]),

  # Refreshing: t-value, then F-value
  (cluster_stats[1, 3] - mean(kmeans_df2[["Refreshing"]])) / sd(kmeans_df2[["Refreshing"]]),
  cluster_stats[1, 13] / var(kmeans_df2[["Refreshing"]]),

  # Delicious: t-value, then F-value
  (cluster_stats[1, 4] - mean(kmeans_df2[["Delicious"]])) / sd(kmeans_df2[["Delicious"]]),
  cluster_stats[1, 14] / var(kmeans_df2[["Delicious"]]),

  # Healthy: t-value, then F-value
  (cluster_stats[1, 5] - mean(kmeans_df2[["Healthy"]])) / sd(kmeans_df2[["Healthy"]]),
  cluster_stats[1, 15] / var(kmeans_df2[["Healthy"]]),

  # Bitter: t-value, then F-value
  (cluster_stats[1, 6] - mean(kmeans_df2[["Bitter"]])) / sd(kmeans_df2[["Bitter"]]),
  cluster_stats[1, 16] / var(kmeans_df2[["Bitter"]]),

  # Light: t-value, then F-value
  (cluster_stats[1, 7] - mean(kmeans_df2[["Light"]])) / sd(kmeans_df2[["Light"]]),
  cluster_stats[1, 17] / var(kmeans_df2[["Light"]]),

  # Crunchy: t-value, then F-value
  (cluster_stats[1, 8] - mean(kmeans_df2[["Crunchy"]])) / sd(kmeans_df2[["Crunchy"]]),
  cluster_stats[1, 18] / var(kmeans_df2[["Crunchy"]]),

  # Exotic: t-value, then F-value
  (cluster_stats[1, 9] - mean(kmeans_df2[["Exotic"]])) / sd(kmeans_df2[["Exotic"]]),
  cluster_stats[1, 19] / var(kmeans_df2[["Exotic"]]),

  # Sweet: t-value, then F-value
  (cluster_stats[1, 10] - mean(kmeans_df2[["Sweet"]])) / sd(kmeans_df2[["Sweet"]]),
  cluster_stats[1, 20] / var(kmeans_df2[["Sweet"]]),

  # Fruity: t-value, then F-value
  (cluster_stats[1, 11] - mean(kmeans_df2[["Fruity"]])) / sd(kmeans_df2[["Fruity"]]),
  cluster_stats[1, 21] / var(kmeans_df2[["Fruity"]])
)

### Cluster 2 (= Classic)
# Profile of the "Classic" cluster (row 2 of cluster_stats) relative to the
# whole sample in kmeans_df2. For every rating variable two cells are
# produced, in this order:
#   t-value = (cluster statistic - overall mean) / sqrt(overall variance)
#   F-value = cluster statistic / overall variance
# Column layout relied upon (same indices as the rest of this section):
# for the k-th variable the t-value reads column k + 1 of cluster_stats and
# the F-value reads column k + 11.
# NOTE(review): cluster_stats is built earlier in the file; presumably
# columns 2-11 hold the cluster means and 12-21 the cluster variances.
C2_F_t_values <- local({
  classic_row <- 2
  rating_vars <- c("Price", "Refreshing", "Delicious", "Healthy", "Bitter",
                   "Light", "Crunchy", "Exotic", "Sweet", "Fruity")

  # Arguments for cbind(): the label column first, then t/F pairs per variable.
  cells <- list(
    cluster = "Cluster = Classic, in the following fn1 = t-value, fn2 = F-value"
  )

  for (k in seq_along(rating_vars)) {
    ratings <- kmeans_df2[[rating_vars[k]]]
    total_var <- var(ratings)

    # t-value of the k-th variable, Cluster 2 (= Classic)
    cells[[length(cells) + 1L]] <-
      (cluster_stats[classic_row, k + 1] - mean(ratings)) / sqrt(total_var)

    # F-value of the k-th variable, Cluster 2 (= Classic)
    cells[[length(cells) + 1L]] <-
      cluster_stats[classic_row, k + 11] / total_var
  }

  # Bind all cells side by side, exactly as one long cbind() call would.
  do.call(cbind, cells)
})

# Results: t-values and F-values by cluster, transposed so that every
# statistic is shown on its own row.
print(t(C1_F_t_values))
print(t(C2_F_t_values))

##                [,1]                                                            
## cluster        "Cluster = Fruit, in the following fn1 = t-value, fn2 = F-value"
## Price_fn1      "-1.37181"                                                      
## Price_fn2      "0.05750611"                                                    
## Refreshing_fn1 "0.8183013"                                                     
## Refreshing_fn2 "2.635516"                                                      
## Delicious_fn1  "-0.9885273"                                                    
## Delicious_fn2  "0.01376937"                                                    
## Healthy_fn1    "-0.7302858"                                                    
## Healthy_fn2    "3.327026"                                                      
## Bitter_fn1     "0.7729246"                                                     
## Bitter_fn2     "0.5153873"                                                     
## Light_fn1      "1.204505"                                                      
## Light_fn2      "0.1064258"                                                     
## Crunchy_fn1    "1.21709"                                                       
## Crunchy_fn2    "0.1136007"                                                     
## Exotic_fn1     "1.390056"                                                      
## Exotic_fn2     "0.3118697"                                                     
## Sweet_fn1      "0.4907929"                                                     
## Sweet_fn2      "0.7161498"                                                     
## Fruity_fn1     "1.405072"                                                      
## Fruity_fn2     "0.3826243"

##                [,1]                                                              
## cluster        "Cluster = Classic, in the following fn1 = t-value, fn2 = F-value"
## Price_fn1      "0.5144286"                                                       
## Price_fn2      "0.3031869"                                                       
## Refreshing_fn1 "-0.306863"                                                       
## Refreshing_fn2 "0.2809712"                                                       
## Delicious_fn1  "0.3706978"                                                       
## Delicious_fn2  "0.8487954"                                                       
## Healthy_fn1    "0.2738572"                                                       
## Healthy_fn2    "0.1637163"                                                       
## Bitter_fn1     "-0.2898467"                                                      
## Bitter_fn2     "0.9292713"                                                       
## Light_fn1      "-0.4516895"                                                      
## Light_fn2      "0.5432088"                                                       
## Crunchy_fn1    "-0.4564086"                                                      
## Crunchy_fn2    "0.5232009"                                                       
## Exotic_fn1     "-0.5212711"                                                      
## Exotic_fn2     "0.2008149"                                                       
## Sweet_fn1      "-0.1840473"                                                      
## Sweet_fn2      "1.082011"                                                        
## Fruity_fn1     "-0.5269019"                                                      
## Fruity_fn2     "0.1558665"

MVA - R Notebook - Chapter 08

–//– Load R Packages –//–

Install and/or load all required R Packages

–//– Clean-Up Working Environment –//–

Removes all objects from the current working environment

–//– Chapter 08 - Cluster Analysis –//–

[=> Case Study] | 8.3 Case Study

(-) Create exemplary dataset.

(-) Proximity measure # Squared Euclidean distances

(-) Figure 8.21 - Squared Euclidean distance matrix of the eleven chocolate types

(-) Hierarchical clustering # Single Linkage Algorithm

(-) Figure 8.20 - Dendrogram for the single-linkage method

(-) Figure 8.22 - Agglomeration schedule of Ward’s method for the case study

(-) Figure 8.23 - Dendrogram of cluster analysis using the Ward process

(-) Determine number of clusters

(-) Figure 8.24 - Development of the heterogeneity measure in the case study

(-) 2 Cluster Solution

(-) Cluster assignment for 2,3,4,5 cluster solution

(-) Figure 8.25 - Cluster membership for different solutions (2 to 5 clusters)

(-) Optimization of the cluster solution with K-Means

(-) Figure 8.26 - Cluster membership and final cluster centers according to the k-means method

(-) Figure 8.27 - ANOVA table of the k-means clustering method

(-) Figure 8.28 - Mean values and variances of the assessments in the survey population (total) and the two clusters

(-) Description of the two cluster solution with t- and F-values

(-) Table 8.21 - t- and F-values of the two-cluster solution in the case study