pacman::p_load("ExPanDaR", "countrycode", "kableExtra", "tidyverse")Data Preparation
Step-by-Step Data Preparation
1. Installing and launching required R packages
2. Loading the data
touristdata <- read_csv("data/Train.csv")
str(touristdata)spc_tbl_ [4,809 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ ID : chr [1:4809] "tour_0" "tour_10" "tour_1000" "tour_1002" ...
$ country : chr [1:4809] "SWIZERLAND" "UNITED KINGDOM" "UNITED KINGDOM" "UNITED KINGDOM" ...
$ age_group : chr [1:4809] "45-64" "25-44" "25-44" "25-44" ...
$ travel_with : chr [1:4809] "Friends/Relatives" NA "Alone" "Spouse" ...
$ total_female : num [1:4809] 1 1 0 1 1 0 0 1 0 1 ...
$ total_male : num [1:4809] 1 0 1 1 0 1 1 1 1 0 ...
$ purpose : chr [1:4809] "Leisure and Holidays" "Leisure and Holidays" "Visiting Friends and Relatives" "Leisure and Holidays" ...
$ main_activity : chr [1:4809] "Wildlife tourism" "Cultural tourism" "Cultural tourism" "Wildlife tourism" ...
$ info_source : chr [1:4809] "Friends, relatives" "others" "Friends, relatives" "Travel, agent, tour operator" ...
$ tour_arrangement : chr [1:4809] "Independent" "Independent" "Independent" "Package Tour" ...
$ package_transport_int: chr [1:4809] "No" "No" "No" "No" ...
$ package_accomodation : chr [1:4809] "No" "No" "No" "Yes" ...
$ package_food : chr [1:4809] "No" "No" "No" "Yes" ...
$ package_transport_tz : chr [1:4809] "No" "No" "No" "Yes" ...
$ package_sightseeing : chr [1:4809] "No" "No" "No" "Yes" ...
$ package_guided_tour : chr [1:4809] "No" "No" "No" "Yes" ...
$ package_insurance : chr [1:4809] "No" "No" "No" "No" ...
$ night_mainland : num [1:4809] 13 14 1 11 7 9 9 10 4 5 ...
$ night_zanzibar : num [1:4809] 0 7 31 0 4 3 0 3 0 0 ...
$ payment_mode : chr [1:4809] "Cash" "Cash" "Cash" "Cash" ...
$ first_trip_tz : chr [1:4809] "No" "Yes" "No" "Yes" ...
$ most_impressing : chr [1:4809] "Friendly People" "Wonderful Country, Landscape, Nature" "Excellent Experience" "Friendly People" ...
$ total_cost : num [1:4809] 674603 3214907 3315000 7790250 1657500 ...
- attr(*, "spec")=
.. cols(
.. ID = col_character(),
.. country = col_character(),
.. age_group = col_character(),
.. travel_with = col_character(),
.. total_female = col_double(),
.. total_male = col_double(),
.. purpose = col_character(),
.. main_activity = col_character(),
.. info_source = col_character(),
.. tour_arrangement = col_character(),
.. package_transport_int = col_character(),
.. package_accomodation = col_character(),
.. package_food = col_character(),
.. package_transport_tz = col_character(),
.. package_sightseeing = col_character(),
.. package_guided_tour = col_character(),
.. package_insurance = col_character(),
.. night_mainland = col_double(),
.. night_zanzibar = col_double(),
.. payment_mode = col_character(),
.. first_trip_tz = col_character(),
.. most_impressing = col_character(),
.. total_cost = col_double()
.. )
- attr(*, "problems")=<externalptr>
3. Dataset overview
3.1 Unique Countries
unique(touristdata$country) [1] "SWIZERLAND" "UNITED KINGDOM"
[3] "CHINA" "SOUTH AFRICA"
[5] "UNITED STATES OF AMERICA" "NIGERIA"
[7] "INDIA" "BRAZIL"
[9] "CANADA" "MALT"
[11] "MOZAMBIQUE" "RWANDA"
[13] "AUSTRIA" "MYANMAR"
[15] "GERMANY" "KENYA"
[17] "ALGERIA" "IRELAND"
[19] "DENMARK" "SPAIN"
[21] "FRANCE" "ITALY"
[23] "EGYPT" "QATAR"
[25] "MALAWI" "JAPAN"
[27] "SWEDEN" "NETHERLANDS"
[29] "UAE" "UGANDA"
[31] "AUSTRALIA" "YEMEN"
[33] "NEW ZEALAND" "BELGIUM"
[35] "NORWAY" "ZIMBABWE"
[37] "ZAMBIA" "CONGO"
[39] "BURGARIA" "PAKISTAN"
[41] "GREECE" "MAURITIUS"
[43] "DRC" "OMAN"
[45] "PORTUGAL" "KOREA"
[47] "SWAZILAND" "TUNISIA"
[49] "KUWAIT" "DOMINICA"
[51] "ISRAEL" "FINLAND"
[53] "CZECH REPUBLIC" "UKRAIN"
[55] "ETHIOPIA" "BURUNDI"
[57] "SCOTLAND" "RUSSIA"
[59] "GHANA" "NIGER"
[61] "MALAYSIA" "COLOMBIA"
[63] "LUXEMBOURG" "NEPAL"
[65] "POLAND" "SINGAPORE"
[67] "LITHUANIA" "HUNGARY"
[69] "INDONESIA" "TURKEY"
[71] "TRINIDAD TOBACCO" "IRAQ"
[73] "SLOVENIA" "UNITED ARAB EMIRATES"
[75] "COMORO" "SRI LANKA"
[77] "IRAN" "MONTENEGRO"
[79] "ANGOLA" "LEBANON"
[81] "SLOVAKIA" "ROMANIA"
[83] "MEXICO" "LATVIA"
[85] "CROATIA" "CAPE VERDE"
[87] "SUDAN" "COSTARICA"
[89] "CHILE" "NAMIBIA"
[91] "TAIWAN" "SERBIA"
[93] "LESOTHO" "GEORGIA"
[95] "PHILIPINES" "IVORY COAST"
[97] "MADAGASCAR" "DJIBOUT"
[99] "CYPRUS" "ARGENTINA"
[101] "URUGUAY" "MORROCO"
[103] "THAILAND" "BERMUDA"
[105] "ESTONIA"
3.2 Missing values
missing.values <- touristdata %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
missing.values %>% kable()| key | num.missing |
|---|---|
| travel_with | 1114 |
| most_impressing | 313 |
| total_male | 5 |
| total_female | 3 |
Visualising missing values
missing_values <- function(origin, vari = "country"){
prepare_missing_values_graph(origin, ts_id = vari)
}missing_values(touristdata, "country")
missing_values(touristdata, "age_group")
3.3 Treating Missing values
convertbinary <- function(x){
as.factor(ifelse(x=="Yes", 1,0))
}touristdata_min_na <- touristdata %>%
mutate(total_tourist = total_male + total_female,
.after = total_male) %>%
mutate(total_night_spent = night_mainland + night_zanzibar,
prop_night_spent_mainland = round(night_mainland/total_night_spent,2),
.after = night_zanzibar) %>%
mutate(travel_with = case_when(total_tourist == 1 ~ "Alone",
TRUE ~ travel_with)) %>%
mutate(most_impressing = case_when(is.na(most_impressing) ~ "No comments",
TRUE ~ most_impressing)) %>%
mutate(across(package_transport_int:package_insurance, convertbinary)) %>%
mutate(across(first_trip_tz, convertbinary))missing.values.clean <- touristdata_min_na %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing==T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
missing.values.clean %>% kable()| key | num.missing |
|---|---|
| travel_with | 29 |
| total_tourist | 8 |
| total_male | 5 |
| total_female | 3 |
missing_values(touristdata_min_na, "country")
touristdata_min_na %>%
filter(country == "CHILE")# A tibble: 2 × 26
ID country age_g…¹ trave…² total…³ total…⁴ total…⁵ purpose main_…⁶ info_…⁷
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 tour_… CHILE 25-44 Friend… 1 1 2 Leisur… Wildli… Friend…
2 tour_… CHILE 25-44 Alone 0 1 1 Leisur… Wildli… Travel…
# … with 16 more variables: tour_arrangement <chr>,
# package_transport_int <fct>, package_accomodation <fct>,
# package_food <fct>, package_transport_tz <fct>, package_sightseeing <fct>,
# package_guided_tour <fct>, package_insurance <fct>, night_mainland <dbl>,
# night_zanzibar <dbl>, total_night_spent <dbl>,
# prop_night_spent_mainland <dbl>, payment_mode <chr>, first_trip_tz <fct>,
# most_impressing <chr>, total_cost <dbl>, and abbreviated variable names …
touristdata_no_na <- touristdata_min_na %>% drop_na()
touristdata_no_na# A tibble: 4,762 × 26
ID country age_g…¹ trave…² total…³ total…⁴ total…⁵ purpose main_…⁶ info_…⁷
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 tour… SWIZER… 45-64 Friend… 1 1 2 Leisur… Wildli… Friend…
2 tour… UNITED… 25-44 Alone 1 0 1 Leisur… Cultur… others
3 tour… UNITED… 25-44 Alone 0 1 1 Visiti… Cultur… Friend…
4 tour… UNITED… 25-44 Spouse 1 1 2 Leisur… Wildli… Travel…
5 tour… CHINA 1-24 Alone 1 0 1 Leisur… Wildli… Travel…
6 tour… UNITED… 25-44 Alone 0 1 1 Leisur… Wildli… Travel…
7 tour… SOUTH … 45-64 Alone 0 1 1 Busine… Mounta… Friend…
8 tour… UNITED… 45-64 Friend… 1 1 2 Leisur… Wildli… Travel…
9 tour… NIGERIA 25-44 Alone 0 1 1 Leisur… Cultur… Travel…
10 tour… INDIA 25-44 Alone 1 0 1 Busine… Wildli… Travel…
# … with 4,752 more rows, 16 more variables: tour_arrangement <chr>,
# package_transport_int <fct>, package_accomodation <fct>,
# package_food <fct>, package_transport_tz <fct>, package_sightseeing <fct>,
# package_guided_tour <fct>, package_insurance <fct>, night_mainland <dbl>,
# night_zanzibar <dbl>, total_night_spent <dbl>,
# prop_night_spent_mainland <dbl>, payment_mode <chr>, first_trip_tz <fct>,
# most_impressing <chr>, total_cost <dbl>, and abbreviated variable names …
3.4 Matching countries to continent
unique(touristdata_no_na$country) [1] "SWIZERLAND" "UNITED KINGDOM"
[3] "CHINA" "SOUTH AFRICA"
[5] "UNITED STATES OF AMERICA" "NIGERIA"
[7] "INDIA" "BRAZIL"
[9] "CANADA" "MALT"
[11] "MOZAMBIQUE" "RWANDA"
[13] "AUSTRIA" "MYANMAR"
[15] "GERMANY" "KENYA"
[17] "ALGERIA" "IRELAND"
[19] "DENMARK" "SPAIN"
[21] "FRANCE" "ITALY"
[23] "EGYPT" "QATAR"
[25] "MALAWI" "JAPAN"
[27] "SWEDEN" "NETHERLANDS"
[29] "UAE" "UGANDA"
[31] "AUSTRALIA" "YEMEN"
[33] "NEW ZEALAND" "BELGIUM"
[35] "NORWAY" "ZIMBABWE"
[37] "ZAMBIA" "CONGO"
[39] "BURGARIA" "PAKISTAN"
[41] "GREECE" "MAURITIUS"
[43] "DRC" "OMAN"
[45] "PORTUGAL" "KOREA"
[47] "SWAZILAND" "TUNISIA"
[49] "KUWAIT" "DOMINICA"
[51] "ISRAEL" "FINLAND"
[53] "CZECH REPUBLIC" "UKRAIN"
[55] "ETHIOPIA" "BURUNDI"
[57] "SCOTLAND" "RUSSIA"
[59] "GHANA" "NIGER"
[61] "MALAYSIA" "COLOMBIA"
[63] "LUXEMBOURG" "NEPAL"
[65] "POLAND" "SINGAPORE"
[67] "LITHUANIA" "HUNGARY"
[69] "INDONESIA" "TURKEY"
[71] "TRINIDAD TOBACCO" "IRAQ"
[73] "SLOVENIA" "UNITED ARAB EMIRATES"
[75] "COMORO" "SRI LANKA"
[77] "IRAN" "MONTENEGRO"
[79] "ANGOLA" "LEBANON"
[81] "SLOVAKIA" "ROMANIA"
[83] "MEXICO" "LATVIA"
[85] "CROATIA" "CAPE VERDE"
[87] "SUDAN" "COSTARICA"
[89] "NAMIBIA" "TAIWAN"
[91] "SERBIA" "LESOTHO"
[93] "GEORGIA" "PHILIPINES"
[95] "IVORY COAST" "MADAGASCAR"
[97] "DJIBOUT" "CYPRUS"
[99] "ARGENTINA" "URUGUAY"
[101] "MORROCO" "THAILAND"
[103] "CHILE" "BERMUDA"
[105] "ESTONIA"
touristdata_clean <- touristdata_no_na %>%
mutate(country = case_when(country == "BURGARIA" ~ "BULGARIA",
country == "DJIBOUT" ~ "DJIBOUTI",
country == "MALT" ~ "MALTA",
country == "MORROCO" ~ "MOROCCO",
country == "PHILIPINES" ~ "PHILIPPINES",
country == "SCOTLAND" ~ "UNITED KINGDOM",
country == "SWIZERLAND" ~ "SWITZERLAND",
TRUE ~ country)) %>%
mutate(region = countrycode(sourcevar = country,
origin = "country.name",
destination = "continent"),
code = countrycode(sourcevar = country,
origin = "country.name",
destination = "iso3c"),
.after = country)
touristdata_clean# A tibble: 4,762 × 28
ID country region code age_g…¹ trave…² total…³ total…⁴ total…⁵ purpose
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
1 tour_0 SWITZE… Europe CHE 45-64 Friend… 1 1 2 Leisur…
2 tour_10 UNITED… Europe GBR 25-44 Alone 1 0 1 Leisur…
3 tour_10… UNITED… Europe GBR 25-44 Alone 0 1 1 Visiti…
4 tour_10… UNITED… Europe GBR 25-44 Spouse 1 1 2 Leisur…
5 tour_10… CHINA Asia CHN 1-24 Alone 1 0 1 Leisur…
6 tour_10… UNITED… Europe GBR 25-44 Alone 0 1 1 Leisur…
7 tour_10… SOUTH … Africa ZAF 45-64 Alone 0 1 1 Busine…
8 tour_10… UNITED… Ameri… USA 45-64 Friend… 1 1 2 Leisur…
9 tour_101 NIGERIA Africa NGA 25-44 Alone 0 1 1 Leisur…
10 tour_10… INDIA Asia IND 25-44 Alone 1 0 1 Busine…
# … with 4,752 more rows, 18 more variables: main_activity <chr>,
# info_source <chr>, tour_arrangement <chr>, package_transport_int <fct>,
# package_accomodation <fct>, package_food <fct>, package_transport_tz <fct>,
# package_sightseeing <fct>, package_guided_tour <fct>,
# package_insurance <fct>, night_mainland <dbl>, night_zanzibar <dbl>,
# total_night_spent <dbl>, prop_night_spent_mainland <dbl>,
# payment_mode <chr>, first_trip_tz <fct>, most_impressing <chr>, …
#write_csv(touristdata_clean,'C:/michaeldjo/VAP/DataPrep/data/touristdata_clean.csv')