Clustering Analysis

Author

Habari Tanzania

Published

March 11, 2023

Modified

March 26, 2023

Load Packages and Data

pacman::p_load(poLCA, ggplot2, plotly, tidyverse)

df <- read_csv("data/touristdata_clean.csv")

Latent Class Analysis

  • As poLCA expects all variables to start at level 1, dichotomous variables that are currently in 0/1 format need to be recoded as 1/2.

  • Continuous variables also need to be binned into categories of relatively similar sizes.

  • In addition, variables with imbalance classification will be recoded into broader (and fewer) categories.

  • Lastly, some of the categories are dropped if they are deemed to be less useful for use in classification.

df_clustering <- df %>%
  select(!code) %>% 
  select(!country) %>%
  mutate(package_transport_int = recode(package_transport_int,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(package_accomodation = recode(package_accomodation,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(package_food = recode(package_food,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(package_transport_tz = recode(package_transport_tz,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(package_sightseeing = recode(package_sightseeing,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(package_guided_tour = recode(package_guided_tour,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(package_insurance = recode(package_insurance,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(first_trip_tz = recode(first_trip_tz,
                                        `1` = 2,
                                        `0` = 1)) %>%
  mutate(duration = cut(total_night_spent, breaks = c(0, 7, Inf), 
             labels = c("Within_1_week", "More_than_1_week"))) %>%
  mutate(mainland_zanzibar = cut(prop_night_spent_mainland, 
                                 breaks = c(0, 0.5, Inf), right = F, 
                                 labels = c("zanzibar", "mainland"))) %>%
  mutate(travel_with = recode(travel_with,
                              `Children` = "ImmediateFamily",
                              `Spouse` = "ImmediateFamily",
                              `Spouse and Children` = "ImmediateFamily")) %>%
  select(!total_female) %>%
  select(!total_male) %>%
  filter(total_tourist > 0) %>%
  mutate(total_tourist = cut(total_tourist,
                             breaks = c(0,1,2,Inf),
                             labels = c("1", "2", "3+"))) %>%
  filter(purpose != "Other") %>%
  mutate(purpose = recode(purpose,
                          `Business` = "Non-Leisure",
                          `Leisure and Holidays` = "Leisure",
                          `Meetings and Conference` = "Non-Leisure",
                          `Scientific and Academic` = "Non-Leisure",
                          `Visiting Friends and Relatives` = "Leisure",
                          `Volunteering` = "Non-Leisure")) %>%
  select(!main_activity) %>%
  mutate(info_source = recode(info_source,
                          `Friends, relatives` = "Word-of-Mouth",
                          `inflight magazines` = "Others",
                          `Newspaper, magazines,brochures` = "Others",
                          `others` = "Others",
                          `Radio, TV, Web` = "Others",
                          `Tanzania Mission Abroad` = "Others",
                          `Trade fair` = "Others",
                          `Travel, agent, tour operator` = "Travel agents")) %>%
  select(!night_mainland) %>%
  select(!night_zanzibar) %>%
  select(!total_night_spent) %>%
  select(!prop_night_spent_mainland) %>%
  mutate(payment_mode = recode(payment_mode,
                               `Cash` = "Cash",
                               `Credit Card` = "Non-Cash",
                               `Other` = "Non-Cash",
                               `Travellers Cheque` = "Non-Cash")) %>%
  select(!most_impressing) %>%
  mutate(total_cost = cut(total_cost, 
                          breaks = c(0, 800000, 3550000, 9950000, Inf), 
                          right = F, 
                          labels = c("800k or less", "800k - 3.55mil", 
                                     "3.55mil - 9.95mil", 
                                     "More than 9.95mil")))

Once the data set is cleaned, all variables are converted into factors and checked to ensure that there are no missing values.

##ensure all variables are factors
df_clustering[-1] <- lapply(df_clustering[-1], factor)

##ensure no missing values
sapply(df_clustering, function(x) sum(is.na(x)))
                   ID                region             age_group 
                    0                     0                     0 
          travel_with         total_tourist               purpose 
                    0                     0                     0 
          info_source      tour_arrangement package_transport_int 
                    0                     0                     0 
 package_accomodation          package_food  package_transport_tz 
                    0                     0                     0 
  package_sightseeing   package_guided_tour     package_insurance 
                    0                     0                     0 
         payment_mode         first_trip_tz            total_cost 
                    0                     0                     0 
             duration     mainland_zanzibar 
                    0                     0 

We can then proceed to run the model by specifying the the number of classes we want and the number of repeats we will run for the model. If the number of repeats > 1 (i.e. more than 1 run of the model is done), it means that a global search was done to obtain the lowest BIC score.

set.seed(1234)

f <- as.formula(cbind(region, age_group, travel_with, total_tourist, purpose,
                      info_source, tour_arrangement, package_transport_int,
                      package_accomodation, package_food, package_transport_tz,
                      package_sightseeing, package_guided_tour,
                      package_insurance, payment_mode, first_trip_tz,
                      total_cost, duration, mainland_zanzibar)~1)

##latent class analysis specifying 3 classes
##the more classes and variables there are, the longer it takes to run

##can consider limiting the nclass to between 2-7
##limit nrep to between 1 to 5
LCA_model <- poLCA(f, df_clustering, nclass=7, nrep=5, maxiter=5000)
Model 1: llik = -46192.65 ... best llik = -46192.65
Model 2: llik = -46192.96 ... best llik = -46192.65
Model 3: llik = -46230.21 ... best llik = -46192.65
Model 4: llik = -46407.38 ... best llik = -46192.65
Model 5: llik = -46575.45 ... best llik = -46192.65
Conditional item response (column) probabilities,
 by outcome variable, for each class (row) 
 
$region
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)
class 1:  0.0149 0.3204 0.0424 0.5507 0.0717
class 2:  0.1801 0.0090 0.0406 0.7513 0.0190
class 3:  0.0039 0.3514 0.0982 0.3924 0.1541
class 4:  0.7875 0.0460 0.0833 0.0766 0.0066
class 5:  0.2764 0.1393 0.0917 0.4692 0.0234
class 6:  0.0863 0.2057 0.1496 0.5289 0.0294
class 7:  0.1848 0.1407 0.1447 0.5102 0.0196

$age_group
           Pr(1)  Pr(2)  Pr(3)  Pr(4)
class 1:  0.0437 0.3619 0.4366 0.1578
class 2:  0.0840 0.5948 0.2981 0.0230
class 3:  0.2014 0.3691 0.2894 0.1400
class 4:  0.0527 0.6791 0.2473 0.0209
class 5:  0.1393 0.5564 0.2709 0.0334
class 6:  0.2881 0.5227 0.1710 0.0182
class 7:  0.3204 0.4375 0.2124 0.0297

$travel_with
           Pr(1)  Pr(2)  Pr(3)
class 1:  0.0000 0.2643 0.7357
class 2:  0.0021 0.3103 0.6876
class 3:  1.0000 0.0000 0.0000
class 4:  1.0000 0.0000 0.0000
class 5:  0.0012 0.4995 0.4993
class 6:  1.0000 0.0000 0.0000
class 7:  1.0000 0.0000 0.0000

$total_tourist
          Pr(1)  Pr(2)  Pr(3)
class 1:      0 0.6568 0.3432
class 2:      0 0.6838 0.3162
class 3:      1 0.0000 0.0000
class 4:      1 0.0000 0.0000
class 5:      0 0.6785 0.3215
class 6:      1 0.0000 0.0000
class 7:      1 0.0000 0.0000

$purpose
           Pr(1)  Pr(2)
class 1:  0.9883 0.0117
class 2:  0.9520 0.0480
class 3:  1.0000 0.0000
class 4:  0.2998 0.7002
class 5:  0.7611 0.2389
class 6:  0.7184 0.2816
class 7:  0.5279 0.4721

$info_source
           Pr(1)  Pr(2)  Pr(3)
class 1:  0.1168 0.7669 0.1162
class 2:  0.2056 0.6495 0.1448
class 3:  0.1355 0.7384 0.1261
class 4:  0.3433 0.1204 0.5363
class 5:  0.3349 0.1675 0.4976
class 6:  0.3382 0.1272 0.5347
class 7:  0.3114 0.4095 0.2790

$tour_arrangement
           Pr(1)  Pr(2)
class 1:  0.0068 0.9932
class 2:  0.0147 0.9853
class 3:  0.0066 0.9934
class 4:  0.9951 0.0049
class 5:  0.9965 0.0035
class 6:  1.0000 0.0000
class 7:  0.0200 0.9800

$package_transport_int
           Pr(1)  Pr(2)
class 1:  0.3538 0.6462
class 2:  0.0956 0.9044
class 3:  0.5913 0.4087
class 4:  0.9951 0.0049
class 5:  0.9969 0.0031
class 6:  0.9984 0.0016
class 7:  0.4459 0.5541

$package_accomodation
           Pr(1)  Pr(2)
class 1:  0.0194 0.9806
class 2:  0.0428 0.9572
class 3:  0.0033 0.9967
class 4:  1.0000 0.0000
class 5:  0.9961 0.0039
class 6:  1.0000 0.0000
class 7:  0.0235 0.9765

$package_food
           Pr(1)  Pr(2)
class 1:  0.0919 0.9081
class 2:  0.1008 0.8992
class 3:  0.0223 0.9777
class 4:  1.0000 0.0000
class 5:  1.0000 0.0000
class 6:  1.0000 0.0000
class 7:  0.1544 0.8456

$package_transport_tz
           Pr(1)  Pr(2)
class 1:  0.0975 0.9025
class 2:  0.3004 0.6996
class 3:  0.0428 0.9572
class 4:  1.0000 0.0000
class 5:  1.0000 0.0000
class 6:  1.0000 0.0000
class 7:  0.3635 0.6365

$package_sightseeing
           Pr(1)  Pr(2)
class 1:  0.1163 0.8837
class 2:  0.8326 0.1674
class 3:  0.0718 0.9282
class 4:  1.0000 0.0000
class 5:  1.0000 0.0000
class 6:  1.0000 0.0000
class 7:  0.7637 0.2363

$package_guided_tour
           Pr(1)  Pr(2)
class 1:  0.1209 0.8791
class 2:  0.7732 0.2268
class 3:  0.0754 0.9246
class 4:  0.9990 0.0010
class 5:  1.0000 0.0000
class 6:  1.0000 0.0000
class 7:  0.6311 0.3689

$package_insurance
           Pr(1)  Pr(2)
class 1:  0.6215 0.3785
class 2:  0.6575 0.3425
class 3:  0.7316 0.2684
class 4:  1.0000 0.0000
class 5:  1.0000 0.0000
class 6:  1.0000 0.0000
class 7:  0.8335 0.1665

$payment_mode
           Pr(1)  Pr(2)
class 1:  0.8255 0.1745
class 2:  0.8702 0.1298
class 3:  0.8905 0.1095
class 4:  0.8741 0.1259
class 5:  0.8455 0.1545
class 6:  0.9014 0.0986
class 7:  0.8927 0.1073

$first_trip_tz
           Pr(1)  Pr(2)
class 1:  0.0904 0.9096
class 2:  0.1829 0.8171
class 3:  0.0728 0.9272
class 4:  0.7068 0.2932
class 5:  0.3617 0.6383
class 6:  0.3580 0.6420
class 7:  0.2452 0.7548

$total_cost
           Pr(1)  Pr(2)  Pr(3)  Pr(4)
class 1:  0.0401 0.1024 0.1873 0.6701
class 2:  0.0398 0.1095 0.4077 0.4429
class 3:  0.0745 0.1395 0.5172 0.2689
class 4:  0.6345 0.3295 0.0321 0.0040
class 5:  0.2436 0.3584 0.2961 0.1020
class 6:  0.2388 0.5147 0.2195 0.0269
class 7:  0.1815 0.2345 0.4914 0.0925

$duration
           Pr(1)  Pr(2)
class 1:  0.2783 0.7217
class 2:  0.6261 0.3739
class 3:  0.4131 0.5869
class 4:  0.8709 0.1291
class 5:  0.4567 0.5433
class 6:  0.2009 0.7991
class 7:  0.4688 0.5312

$mainland_zanzibar
           Pr(1)  Pr(2)
class 1:  0.0667 0.9333
class 2:  0.7485 0.2515
class 3:  0.0245 0.9755
class 4:  0.0109 0.9891
class 5:  0.2049 0.7951
class 6:  0.1288 0.8712
class 7:  0.0839 0.9161

Estimated class population shares 
 0.2199 0.1044 0.0999 0.2073 0.1848 0.1303 0.0533 
 
Predicted class memberships (by modal posterior prob.) 
 0.2218 0.1026 0.1019 0.2112 0.1848 0.1266 0.0513 
 
========================================================= 
Fit for 7 latent classes: 
========================================================= 
number of observations: 4622 
number of estimated parameters: 209 
residual degrees of freedom: 4413 
maximum log-likelihood: -46192.65 
 
AIC(7): 92803.3
BIC(7): 94148.96
G^2(7): 21945.37 (Likelihood ratio/deviance statistic) 
X^2(7): 48495716 (Chi-square goodness of fit) 
 

Plot BIC score

##in box similar to RSME for regression tree
LCA_model$bic
[1] 94148.96

Plot AIC score

##in box similar to RSME for regression tree
LCA_model$aic
[1] 92803.3

Plot Entropy

  • Entropy indicates how accurately the model defines classes. In general, an entropy value close to 1 is ideal and above 0.8 is acceptable. Anything below 0.6 is likely to be a suggestion of poor classification.
##in box similar to RSME for regression tree
entropy <- function(p) sum(-p*log(p))
error_prior <- entropy(LCA_model$P)
error_post <- mean(apply(LCA_model$posterior, c(1,2), entropy), na.rm=T)
LCA_entropy <- (error_prior - error_post) / error_prior
LCA_entropy
[1] 0.9777623

Visualising Results

  • The model classification is appended to the original data set for plotting of the variables to compare across the classes.
df_clustering$class <- LCA_model$predclass
df_clustering$class <- factor(df_clustering$class)
##plotting results - change of x only
plot_table <- df_clustering %>%
  group_by(purpose, class) %>% ##input x to be charted
  summarise(counts = n()) %>% 
  ungroup

p1 <- ggplot(plot_table, aes(fill = purpose, y = counts, x = class)) + 
  geom_bar(position = "fill", stat = "identity")

fig1 <- ggplotly(p1)

fig1
##alternative chart where x becomes the fill
p2 <- ggplot(plot_table, aes(fill = class, y = counts, x = purpose)) + 
  geom_bar(position = "fill", stat = "identity")

fig2 <- ggplotly(p2)

fig2