Data Preparation

Author

Habari Tanzania

Published

February 14, 2023

Modified

March 25, 2023

Step-by-Step Data Preparation

1. Installing and launching required R packages

pacman::p_load("ExPanDaR", "countrycode", "kableExtra", "tidyverse")

2. Loading the data

touristdata <- read_csv("data/Train.csv")
str(touristdata)
spc_tbl_ [4,809 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ID                   : chr [1:4809] "tour_0" "tour_10" "tour_1000" "tour_1002" ...
 $ country              : chr [1:4809] "SWIZERLAND" "UNITED KINGDOM" "UNITED KINGDOM" "UNITED KINGDOM" ...
 $ age_group            : chr [1:4809] "45-64" "25-44" "25-44" "25-44" ...
 $ travel_with          : chr [1:4809] "Friends/Relatives" NA "Alone" "Spouse" ...
 $ total_female         : num [1:4809] 1 1 0 1 1 0 0 1 0 1 ...
 $ total_male           : num [1:4809] 1 0 1 1 0 1 1 1 1 0 ...
 $ purpose              : chr [1:4809] "Leisure and Holidays" "Leisure and Holidays" "Visiting Friends and Relatives" "Leisure and Holidays" ...
 $ main_activity        : chr [1:4809] "Wildlife tourism" "Cultural tourism" "Cultural tourism" "Wildlife tourism" ...
 $ info_source          : chr [1:4809] "Friends, relatives" "others" "Friends, relatives" "Travel, agent, tour operator" ...
 $ tour_arrangement     : chr [1:4809] "Independent" "Independent" "Independent" "Package Tour" ...
 $ package_transport_int: chr [1:4809] "No" "No" "No" "No" ...
 $ package_accomodation : chr [1:4809] "No" "No" "No" "Yes" ...
 $ package_food         : chr [1:4809] "No" "No" "No" "Yes" ...
 $ package_transport_tz : chr [1:4809] "No" "No" "No" "Yes" ...
 $ package_sightseeing  : chr [1:4809] "No" "No" "No" "Yes" ...
 $ package_guided_tour  : chr [1:4809] "No" "No" "No" "Yes" ...
 $ package_insurance    : chr [1:4809] "No" "No" "No" "No" ...
 $ night_mainland       : num [1:4809] 13 14 1 11 7 9 9 10 4 5 ...
 $ night_zanzibar       : num [1:4809] 0 7 31 0 4 3 0 3 0 0 ...
 $ payment_mode         : chr [1:4809] "Cash" "Cash" "Cash" "Cash" ...
 $ first_trip_tz        : chr [1:4809] "No" "Yes" "No" "Yes" ...
 $ most_impressing      : chr [1:4809] "Friendly People" "Wonderful Country, Landscape, Nature" "Excellent Experience" "Friendly People" ...
 $ total_cost           : num [1:4809] 674603 3214907 3315000 7790250 1657500 ...
 - attr(*, "spec")=
  .. cols(
  ..   ID = col_character(),
  ..   country = col_character(),
  ..   age_group = col_character(),
  ..   travel_with = col_character(),
  ..   total_female = col_double(),
  ..   total_male = col_double(),
  ..   purpose = col_character(),
  ..   main_activity = col_character(),
  ..   info_source = col_character(),
  ..   tour_arrangement = col_character(),
  ..   package_transport_int = col_character(),
  ..   package_accomodation = col_character(),
  ..   package_food = col_character(),
  ..   package_transport_tz = col_character(),
  ..   package_sightseeing = col_character(),
  ..   package_guided_tour = col_character(),
  ..   package_insurance = col_character(),
  ..   night_mainland = col_double(),
  ..   night_zanzibar = col_double(),
  ..   payment_mode = col_character(),
  ..   first_trip_tz = col_character(),
  ..   most_impressing = col_character(),
  ..   total_cost = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 

3. Dataset overview

3.1 Unique Countries

unique(touristdata$country)
  [1] "SWIZERLAND"               "UNITED KINGDOM"          
  [3] "CHINA"                    "SOUTH AFRICA"            
  [5] "UNITED STATES OF AMERICA" "NIGERIA"                 
  [7] "INDIA"                    "BRAZIL"                  
  [9] "CANADA"                   "MALT"                    
 [11] "MOZAMBIQUE"               "RWANDA"                  
 [13] "AUSTRIA"                  "MYANMAR"                 
 [15] "GERMANY"                  "KENYA"                   
 [17] "ALGERIA"                  "IRELAND"                 
 [19] "DENMARK"                  "SPAIN"                   
 [21] "FRANCE"                   "ITALY"                   
 [23] "EGYPT"                    "QATAR"                   
 [25] "MALAWI"                   "JAPAN"                   
 [27] "SWEDEN"                   "NETHERLANDS"             
 [29] "UAE"                      "UGANDA"                  
 [31] "AUSTRALIA"                "YEMEN"                   
 [33] "NEW ZEALAND"              "BELGIUM"                 
 [35] "NORWAY"                   "ZIMBABWE"                
 [37] "ZAMBIA"                   "CONGO"                   
 [39] "BURGARIA"                 "PAKISTAN"                
 [41] "GREECE"                   "MAURITIUS"               
 [43] "DRC"                      "OMAN"                    
 [45] "PORTUGAL"                 "KOREA"                   
 [47] "SWAZILAND"                "TUNISIA"                 
 [49] "KUWAIT"                   "DOMINICA"                
 [51] "ISRAEL"                   "FINLAND"                 
 [53] "CZECH REPUBLIC"           "UKRAIN"                  
 [55] "ETHIOPIA"                 "BURUNDI"                 
 [57] "SCOTLAND"                 "RUSSIA"                  
 [59] "GHANA"                    "NIGER"                   
 [61] "MALAYSIA"                 "COLOMBIA"                
 [63] "LUXEMBOURG"               "NEPAL"                   
 [65] "POLAND"                   "SINGAPORE"               
 [67] "LITHUANIA"                "HUNGARY"                 
 [69] "INDONESIA"                "TURKEY"                  
 [71] "TRINIDAD TOBACCO"         "IRAQ"                    
 [73] "SLOVENIA"                 "UNITED ARAB EMIRATES"    
 [75] "COMORO"                   "SRI LANKA"               
 [77] "IRAN"                     "MONTENEGRO"              
 [79] "ANGOLA"                   "LEBANON"                 
 [81] "SLOVAKIA"                 "ROMANIA"                 
 [83] "MEXICO"                   "LATVIA"                  
 [85] "CROATIA"                  "CAPE VERDE"              
 [87] "SUDAN"                    "COSTARICA"               
 [89] "CHILE"                    "NAMIBIA"                 
 [91] "TAIWAN"                   "SERBIA"                  
 [93] "LESOTHO"                  "GEORGIA"                 
 [95] "PHILIPINES"               "IVORY COAST"             
 [97] "MADAGASCAR"               "DJIBOUT"                 
 [99] "CYPRUS"                   "ARGENTINA"               
[101] "URUGUAY"                  "MORROCO"                 
[103] "THAILAND"                 "BERMUDA"                 
[105] "ESTONIA"                 

3.2 Missing values

missing.values <- touristdata %>%
    gather(key = "key", value = "val") %>%
    mutate(is.missing = is.na(val)) %>%
    group_by(key, is.missing) %>%
    summarise(num.missing = n()) %>%
    filter(is.missing==T) %>%
    select(-is.missing) %>%
    arrange(desc(num.missing)) 
missing.values %>% kable()
key num.missing
travel_with 1114
most_impressing 313
total_male 5
total_female 3

Visualising missing values

missing_values <- function(origin, vari = "country"){
prepare_missing_values_graph(origin, ts_id = vari)
}
missing_values(touristdata, "country")

missing_values(touristdata, "age_group")

3.3 Treating Missing values

convertbinary <- function(x){
    as.factor(ifelse(x=="Yes", 1,0))
}
touristdata_min_na <- touristdata %>%
  mutate(total_tourist = total_male + total_female,
         .after = total_male) %>%
  mutate(total_night_spent = night_mainland + night_zanzibar,
         prop_night_spent_mainland = round(night_mainland/total_night_spent,2),
         .after = night_zanzibar) %>%
  mutate(travel_with = case_when(total_tourist == 1 ~ "Alone",
                                 TRUE ~ travel_with)) %>%
  mutate(most_impressing = case_when(is.na(most_impressing) ~ "No comments",
                                     TRUE ~ most_impressing)) %>%
  mutate(across(package_transport_int:package_insurance, convertbinary)) %>%
  mutate(across(first_trip_tz, convertbinary))
missing.values.clean <- touristdata_min_na %>%
    gather(key = "key", value = "val") %>%
    mutate(is.missing = is.na(val)) %>%
    group_by(key, is.missing) %>%
    summarise(num.missing = n()) %>%
    filter(is.missing==T) %>%
    select(-is.missing) %>%
    arrange(desc(num.missing)) 
missing.values.clean %>% kable()
key num.missing
travel_with 29
total_tourist 8
total_male 5
total_female 3
missing_values(touristdata_min_na, "country")

touristdata_min_na %>%
  filter(country == "CHILE")
# A tibble: 2 × 26
  ID     country age_g…¹ trave…² total…³ total…⁴ total…⁵ purpose main_…⁶ info_…⁷
  <chr>  <chr>   <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>   <chr>   <chr>  
1 tour_… CHILE   25-44   Friend…       1       1       2 Leisur… Wildli… Friend…
2 tour_… CHILE   25-44   Alone         0       1       1 Leisur… Wildli… Travel…
# … with 16 more variables: tour_arrangement <chr>,
#   package_transport_int <fct>, package_accomodation <fct>,
#   package_food <fct>, package_transport_tz <fct>, package_sightseeing <fct>,
#   package_guided_tour <fct>, package_insurance <fct>, night_mainland <dbl>,
#   night_zanzibar <dbl>, total_night_spent <dbl>,
#   prop_night_spent_mainland <dbl>, payment_mode <chr>, first_trip_tz <fct>,
#   most_impressing <chr>, total_cost <dbl>, and abbreviated variable names …
touristdata_no_na <- touristdata_min_na %>% drop_na()
touristdata_no_na
# A tibble: 4,762 × 26
   ID    country age_g…¹ trave…² total…³ total…⁴ total…⁵ purpose main_…⁶ info_…⁷
   <chr> <chr>   <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>   <chr>   <chr>  
 1 tour… SWIZER… 45-64   Friend…       1       1       2 Leisur… Wildli… Friend…
 2 tour… UNITED… 25-44   Alone         1       0       1 Leisur… Cultur… others 
 3 tour… UNITED… 25-44   Alone         0       1       1 Visiti… Cultur… Friend…
 4 tour… UNITED… 25-44   Spouse        1       1       2 Leisur… Wildli… Travel…
 5 tour… CHINA   1-24    Alone         1       0       1 Leisur… Wildli… Travel…
 6 tour… UNITED… 25-44   Alone         0       1       1 Leisur… Wildli… Travel…
 7 tour… SOUTH … 45-64   Alone         0       1       1 Busine… Mounta… Friend…
 8 tour… UNITED… 45-64   Friend…       1       1       2 Leisur… Wildli… Travel…
 9 tour… NIGERIA 25-44   Alone         0       1       1 Leisur… Cultur… Travel…
10 tour… INDIA   25-44   Alone         1       0       1 Busine… Wildli… Travel…
# … with 4,752 more rows, 16 more variables: tour_arrangement <chr>,
#   package_transport_int <fct>, package_accomodation <fct>,
#   package_food <fct>, package_transport_tz <fct>, package_sightseeing <fct>,
#   package_guided_tour <fct>, package_insurance <fct>, night_mainland <dbl>,
#   night_zanzibar <dbl>, total_night_spent <dbl>,
#   prop_night_spent_mainland <dbl>, payment_mode <chr>, first_trip_tz <fct>,
#   most_impressing <chr>, total_cost <dbl>, and abbreviated variable names …

3.4 Matching countries to continent

unique(touristdata_no_na$country) 
  [1] "SWIZERLAND"               "UNITED KINGDOM"          
  [3] "CHINA"                    "SOUTH AFRICA"            
  [5] "UNITED STATES OF AMERICA" "NIGERIA"                 
  [7] "INDIA"                    "BRAZIL"                  
  [9] "CANADA"                   "MALT"                    
 [11] "MOZAMBIQUE"               "RWANDA"                  
 [13] "AUSTRIA"                  "MYANMAR"                 
 [15] "GERMANY"                  "KENYA"                   
 [17] "ALGERIA"                  "IRELAND"                 
 [19] "DENMARK"                  "SPAIN"                   
 [21] "FRANCE"                   "ITALY"                   
 [23] "EGYPT"                    "QATAR"                   
 [25] "MALAWI"                   "JAPAN"                   
 [27] "SWEDEN"                   "NETHERLANDS"             
 [29] "UAE"                      "UGANDA"                  
 [31] "AUSTRALIA"                "YEMEN"                   
 [33] "NEW ZEALAND"              "BELGIUM"                 
 [35] "NORWAY"                   "ZIMBABWE"                
 [37] "ZAMBIA"                   "CONGO"                   
 [39] "BURGARIA"                 "PAKISTAN"                
 [41] "GREECE"                   "MAURITIUS"               
 [43] "DRC"                      "OMAN"                    
 [45] "PORTUGAL"                 "KOREA"                   
 [47] "SWAZILAND"                "TUNISIA"                 
 [49] "KUWAIT"                   "DOMINICA"                
 [51] "ISRAEL"                   "FINLAND"                 
 [53] "CZECH REPUBLIC"           "UKRAIN"                  
 [55] "ETHIOPIA"                 "BURUNDI"                 
 [57] "SCOTLAND"                 "RUSSIA"                  
 [59] "GHANA"                    "NIGER"                   
 [61] "MALAYSIA"                 "COLOMBIA"                
 [63] "LUXEMBOURG"               "NEPAL"                   
 [65] "POLAND"                   "SINGAPORE"               
 [67] "LITHUANIA"                "HUNGARY"                 
 [69] "INDONESIA"                "TURKEY"                  
 [71] "TRINIDAD TOBACCO"         "IRAQ"                    
 [73] "SLOVENIA"                 "UNITED ARAB EMIRATES"    
 [75] "COMORO"                   "SRI LANKA"               
 [77] "IRAN"                     "MONTENEGRO"              
 [79] "ANGOLA"                   "LEBANON"                 
 [81] "SLOVAKIA"                 "ROMANIA"                 
 [83] "MEXICO"                   "LATVIA"                  
 [85] "CROATIA"                  "CAPE VERDE"              
 [87] "SUDAN"                    "COSTARICA"               
 [89] "NAMIBIA"                  "TAIWAN"                  
 [91] "SERBIA"                   "LESOTHO"                 
 [93] "GEORGIA"                  "PHILIPINES"              
 [95] "IVORY COAST"              "MADAGASCAR"              
 [97] "DJIBOUT"                  "CYPRUS"                  
 [99] "ARGENTINA"                "URUGUAY"                 
[101] "MORROCO"                  "THAILAND"                
[103] "CHILE"                    "BERMUDA"                 
[105] "ESTONIA"                 
touristdata_clean <- touristdata_no_na %>% 
  mutate(country = case_when(country == "BURGARIA" ~ "BULGARIA",
                             country == "DJIBOUT" ~ "DJIBOUTI",
                             country == "MALT" ~ "MALTA",
                             country == "MORROCO" ~ "MOROCCO",
                             country == "PHILIPINES" ~ "PHILIPPINES",
                             country == "SCOTLAND" ~ "UNITED KINGDOM",
                             country == "SWIZERLAND" ~ "SWITZERLAND",
                             TRUE ~ country)) %>%
  mutate(region = countrycode(sourcevar = country,
                              origin = "country.name",
                              destination = "continent"),
         code = countrycode(sourcevar = country,
                            origin = "country.name",
                            destination = "iso3c"),
         .after = country)

touristdata_clean
# A tibble: 4,762 × 28
   ID       country region code  age_g…¹ trave…² total…³ total…⁴ total…⁵ purpose
   <chr>    <chr>   <chr>  <chr> <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>  
 1 tour_0   SWITZE… Europe CHE   45-64   Friend…       1       1       2 Leisur…
 2 tour_10  UNITED… Europe GBR   25-44   Alone         1       0       1 Leisur…
 3 tour_10… UNITED… Europe GBR   25-44   Alone         0       1       1 Visiti…
 4 tour_10… UNITED… Europe GBR   25-44   Spouse        1       1       2 Leisur…
 5 tour_10… CHINA   Asia   CHN   1-24    Alone         1       0       1 Leisur…
 6 tour_10… UNITED… Europe GBR   25-44   Alone         0       1       1 Leisur…
 7 tour_10… SOUTH … Africa ZAF   45-64   Alone         0       1       1 Busine…
 8 tour_10… UNITED… Ameri… USA   45-64   Friend…       1       1       2 Leisur…
 9 tour_101 NIGERIA Africa NGA   25-44   Alone         0       1       1 Leisur…
10 tour_10… INDIA   Asia   IND   25-44   Alone         1       0       1 Busine…
# … with 4,752 more rows, 18 more variables: main_activity <chr>,
#   info_source <chr>, tour_arrangement <chr>, package_transport_int <fct>,
#   package_accomodation <fct>, package_food <fct>, package_transport_tz <fct>,
#   package_sightseeing <fct>, package_guided_tour <fct>,
#   package_insurance <fct>, night_mainland <dbl>, night_zanzibar <dbl>,
#   total_night_spent <dbl>, prop_night_spent_mainland <dbl>,
#   payment_mode <chr>, first_trip_tz <fct>, most_impressing <chr>, …
#write_csv(touristdata_clean,'C:/michaeldjo/VAP/DataPrep/data/touristdata_clean.csv')