Confirmatory Data Analysis

Author

Michael Djohan

Published

March 11, 2023

Modified

March 25, 2023

1. Installing and launching the required R packages

pacman::p_load("tmap", "ExPanDaR", "kableExtra", "ggstatsplot", "plotly", "DT", "scales","tidyverse")

2. Loading the data

touristdata_clean <- read_csv("data/touristdata_clean.csv")

2.1 Adding cost per pax

touristdata_clean <- touristdata_clean %>%
  filter(total_cost > 0,
         total_tourist > 0,
         total_night_spent > 0) %>%
  mutate(cost_per_pax = round(total_cost/total_tourist,0),
         cost_per_night = round(total_cost/total_night_spent,0),
         cost_per_pax_night = round(total_cost/total_tourist/total_night_spent,0))

3. Data overview

3.1 Desriptive Statistics

descr <- prepare_descriptive_table(touristdata_clean %>%
                                       select(!c(14,15,16,17,18,19,20,26))
                                   ) 


datatable(descr$df, class= "hover") %>%
  formatRound(column = c("Mean", "Std. dev.", "Median"), digits=2)

3.2 Distribution of numerical variables

plot <- touristdata_clean %>% 
  select(c(7,8,9,21,22,23,24,28,29,30,31)) %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap( ~key, ncol=4, scales="free") +
  geom_histogram()

ggplotly(plot)
0e+001e+072e+073e+074e+075e+0701000200030000.0e+002.5e+075.0e+077.5e+0705001000150020000e+001e+072e+073e+070100020003000050100150050010001500020406001000200030000.000.250.500.751.0001000200030000.0e+002.5e+075.0e+077.5e+071.0e+08050010001500010203040500100020003000010203040010002000300005010015005001000150002550750500100015002000
valuecountcost_per_nightcost_per_paxcost_per_pax_nightnight_mainlandnight_zanzibarprop_night_spent_mainlandtotal_costtotal_femaletotal_maletotal_night_spenttotal_tourist

Note:

  1. Majority spent on mainland
  2. Mostly travel <5 people
  3. Spent 5 days on median

3.3 Distribution of categorical variables

plot <- touristdata_clean %>% 
  select(c(3,5,6,10,11,12,13,14,15,16,17,18,19,20,25,26,27)) %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap( ~key, ncol=3, scales="free") +
  geom_bar()

ggplotly(plot)
1-2425-4445-6465+05001000150020002500010100020003000Friends, relativesinflight magazinesNewspaper, magazines,brochuresothersRadio, TV, WebTanzania Mission AbroadTrade fairTravel, agent, tour operator050010001500Beach tourismBird watchingbusinessConference tourismCultural tourismDiving and Sport FishingHunting tourismMountain climbingWildlife tourism0500100015002000Excellent ExperienceFriendly PeopleGood serviceNo commentsSatisfies and Hope Come BackWildlifeWonderful Country, Landscape, Nature05001000150001010002000010100020000101000200030000101000200030004000010100020003000010100020003000010100020003000CashCredit CardOtherTravellers Cheque01000200030004000BusinessLeisure and HolidaysMeetings and ConferenceOtherScientific and AcademicVisiting Friends and RelativesVolunteering010002000AfricaAmericasAsiaEuropeOceania0500100015002000IndependentPackage Tour010002000AloneChildrenFriends/RelativesSpouseSpouse and Children0500100015002000
valuecountage_groupfirst_trip_tzinfo_sourcemain_activitymost_impressingpackage_accomodationpackage_foodpackage_guided_tourpackage_insurancepackage_sightseeingpackage_transport_intpackage_transport_tzpayment_modepurposeregiontour_arrangementtravel_with

Note:

  1. Many younger tourist (25-44)
  2. Info mainly comes from friends and travel agents
  3. Many comes for wildlife tourism and beach tourism
  4. Many European and African Tourists
  5. Many travel alone
  6. Many comes for leisure and holidays

4. Exploring by Regions and Countries

4.1 Plotting the choropleth map for aggregated metrics

The object World is a spatial object of class sf from the sf package; it is a data.frame with a special column that contains a geometry for each row, in this case polygons

Reference - https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.html

data("World")
touristdata_clean_country <- touristdata_clean %>%
  group_by(country,code,region) %>%
  summarise(total_female = sum(total_female),
            total_male = sum(total_male),
            total_tourist = sum(total_tourist),
            total_cost = round(sum(total_cost),0),
            total_night_spent = round(sum(total_night_spent),0),
            cost_per_pax = round(mean(cost_per_pax),0),
            cost_per_night = round(mean(cost_per_night),0),
            cost_per_pax_night = round(mean(cost_per_pax_night),0),
            trips = n()) %>%
  mutate(avg_night_spent = round(total_night_spent/trips,0)) %>%
  ungroup()

Joining the two dataframes together

touristdata_clean_map <- left_join(World, 
                                 touristdata_clean_country, 
                          by = c("iso_a3" = "code")) %>%
  select(-c(2:15)) %>%
  na.omit()
plot_map_eda <- function(metric = "total_tourist", style = "jenks", classes = 5, minvisitors = 30){

metric_text = case_when(metric == "total_female" ~ "Number of Female Visitors",
                        metric == "total_male" ~ "Number of Male Visitors",
                        metric == "total_tourist" ~ "Number of Visitors",
                        metric == "total_cost" ~ "Total Spending (TZS)",
                        metric == "cost_per_pax" ~ "Average Individual Spending (TZS)",
                        metric == "cost_per_night" ~ "Average Spending per Night (TZS)",
                        metric == "cost_per_pax_night" ~ "Average Individual Spending per Night (TZS)")
  
tmap_mode("view")

tm_shape(touristdata_clean_map %>%
           filter(total_tourist >= minvisitors))+
  tm_fill(metric, 
          n = classes,
          style = style, 
          palette="YlGn", 
          id = "country",
          title = metric_text
          ) +
  
  tm_borders(col = "grey20",
             alpha = 0.5) 

}
plot_map_eda(metric = "cost_per_pax_night", style = "jenks", classes = 6, minvisitors = 20)
Average Individual Spending per Night (TZS)
101,316 to 150,444
150,444 to 405,977
405,977 to 574,706
574,706 to 724,281
724,281 to 891,004
891,004 to 1,389,025
Leaflet | Tiles © Esri — Esri, DeLorme, NAVTEQ

Note:

  1. Majority of tourists come from US, Western Europe, and South Africa
  2. Many outliers in individual average spending, so only focus on countries with at least 20 visitors, then it’s revealed that Australians spend the highest, followed by Americans and Canadians.

4.2 CDA by Regions

4.2.1 For Numerical Variables

plot_ANOVA_region <- function(metric = "cost_per_pax_night", minvisitors = 30, testtype = "np", pair = "ns", compare = T, conf = 0.95, nooutliers = T) {
  
metric_text = case_when(metric == "total_female" ~ "Number of Female Visitors",
                        metric == "total_male" ~ "Number of Male Visitors",
                        metric == "total_tourist" ~ "Number of Visitors",
                        metric == "total_cost" ~ "Total Spending (TZS)",
                        metric == "cost_per_pax" ~ "Individual Spending (TZS)",
                        metric == "cost_per_night" ~ "Spending per Night (TZS)",
                        metric == "cost_per_pax_night" ~ "Individual Spending per Night (TZS)",
                        metric == "prop_night_spent_mainland" ~ "Proportion of Night spent in Mainland Tanzania",
                        TRUE ~ metric)

paratext <- case_when(testtype == "p" ~ "Mean (Parametric)",
                      testtype == "np" ~ "Median (Non-Parametric)",
                      testtype == "r" ~ "Mean (Robust t-test)",
                      testtype == "bayes" ~ "Mean (Bayesian)",
                      )

touristdata_countrylist <- touristdata_clean_country %>%
  filter(total_tourist >= minvisitors)

countrylist <- unique(touristdata_countrylist$country)

touristdata_ANOVA <- touristdata_clean %>%
  filter(country %in% countrylist) %>%
  mutate(region = fct_reorder(region, !!sym(metric), median, .desc = TRUE)) %>%
  drop_na()

if(nooutliers == T){
  touristdata_ANOVA <- touristdata_ANOVA %>%
      treat_outliers() 
}

touristdata_ANOVA %>% 
  ggbetweenstats(x = region, y = !!sym(metric),
                 xlab = "Region", ylab = metric_text,
                 type = testtype, pairwise.comparisons = T, pairwise.display = pair, 
                 mean.ci = T, p.adjust.method = "fdr",  conf.level = conf,
                 title = paste0("Comparison of ",paratext," ",metric_text, " across Regions"),
                 package = "ggthemes", palette = "Tableau_10")

}
plot_ANOVA_region(metric = "cost_per_pax_night", minvisitors = 30, testtype = "np", pair = "ns", compare = T, conf = 0.95, nooutliers = T)

Note: Outliers can be treated Ability to exclude countries with less than minvisitors limit

  1. Oceania visitors spend the most while African visitors tends to spend the least
plot_ANOVA_region(metric = "prop_night_spent_mainland", minvisitors = 30, testtype = "np", pair = "ns", conf = 0.95)

Note : European customers spend more time in Zanzibar

4.2.2 For Categorical Variables

plot_barstats_region <- function(metric = "purpose", minvisitors = 30, testtype = "np",conf = 0.95) {
  
metric_text = case_when(metric == "age_group" ~ "Age Group",
                        metric == "travel_with" ~ "Travelling Companion",
                        metric == "purpose" ~ "Purpose",
                        metric == "main_activity" ~ "Main Activity",
                        metric == "info_source" ~ "Source of Information",
                        metric == "tour_arrangement" ~ "Tour Arrangement",
                        metric == "package_transport_int" ~ "Include International Transportation?",
                        metric == "package_accomodation" ~ "Include accomodation service?",
                        metric == "package_food" ~ "Include food service?",
                        metric == "package_transport_tz" ~ "Include domestic transport service?",
                        metric == "package_sightseeing" ~ "Include sightseeing service?",
                        metric == "package_guided_tour" ~ "Include guided tour?",
                        metric == "package_insurance" ~ "Insurance Included?",
                        metric == "payment_mode" ~ "Mode of Payment",
                        metric == "first_trip_tz" ~ "First trip to Tanzania?",
                        metric == "most_impressing" ~ "Most impressive about Tanzania?",
                        TRUE ~ metric)

touristdata_countrylist <- touristdata_clean_country %>%
  filter(total_tourist >= minvisitors)

countrylist <- unique(touristdata_countrylist$country)

touristdata_barstats <- touristdata_clean %>%
  filter(country %in% countrylist) %>%
  drop_na()

touristdata_barstats %>% 
  ggbarstats(x = !!sym(metric), y = region,
             xlab = "Region", legend.title = metric_text,
             type = testtype, conf.level = conf,
             palette = "Set2")

}
plot_barstats_region(metric = "travel_with", minvisitors = 30, testtype = "np", conf = 0.95)

plot_barstats_region(metric = "purpose", minvisitors = 30, testtype = "np", conf = 0.95)

plot_barstats_region(metric = "package_insurance", minvisitors = 30, testtype = "np", conf = 0.95)

plot_barstats_region(metric = "first_trip_tz", minvisitors = 30, testtype = "np", conf = 0.95)

4.3 CDA by Selected Countries

Examining the largest spender by country

touristdata_clean_country_sorted <- touristdata_clean_country %>%
  filter(total_cost > 0,
         total_tourist >= 60) %>%
  arrange(desc(cost_per_pax_night))

top8 <- touristdata_clean_country_sorted$country[1:8]

touristdata_clean_country_sorted
# A tibble: 24 × 13
   country  code  region total…¹ total…² total…³ total…⁴ total…⁵ cost_…⁶ cost_…⁷
   <chr>    <chr> <chr>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 AUSTRAL… AUS   Ocean…     185     126     311  2.72e9    1970  9.58e6 2134147
 2 SOUTH A… ZAF   Africa     177     258     435  2.57e9    1707  5.78e6 2138016
 3 UNITED … USA   Ameri…     696     632    1328  8.74e9    7602  7.08e6 1571607
 4 CANADA   CAN   Ameri…     119      88     207  1.42e9    1746  7.45e6 1240173
 5 DENMARK  DNK   Europe      40      34      74  5.97e8     839  1.09e7  914157
 6 SWITZER… CHE   Europe      74      63     137  7.08e8     849  5.39e6  956006
 7 SWEDEN   SWE   Europe      35      32      67  2.96e8     401  4.93e6  802780
 8 SPAIN    ESP   Europe     230     178     408  1.61e9    1816  4.21e6 1235937
 9 ITALY    ITA   Europe     467     464     931  3.64e9    3888  4.06e6 1049569
10 FRANCE   FRA   Europe     374     355     729  3.32e9    3581  4.74e6 1169800
# … with 14 more rows, 3 more variables: cost_per_pax_night <dbl>, trips <int>,
#   avg_night_spent <dbl>, and abbreviated variable names ¹​total_female,
#   ²​total_male, ³​total_tourist, ⁴​total_cost, ⁵​total_night_spent,
#   ⁶​cost_per_pax, ⁷​cost_per_night

4.3.1 For Numerical Variables

plot_ANOVA_country <- function(metric = "cost_per_pax_night", selected_countries, testtype = "np", pair = "ns", compare = T, conf = 0.95, nooutliers = T) {
  
metric_text = case_when(metric == "total_female" ~ "Number of Female Visitors",
                        metric == "total_male" ~ "Number of Male Visitors",
                        metric == "total_tourist" ~ "Number of Visitors",
                        metric == "total_cost" ~ "Total Spending (TZS)",
                        metric == "cost_per_pax" ~ "Individual Spending (TZS)",
                        metric == "cost_per_night" ~ "Spending per Night (TZS)",
                        metric == "cost_per_pax_night" ~ "Individual Spending per Night (TZS)",
                        metric == "prop_night_spent_mainland" ~ "Proportion of Night spent in Mainland Tanzania",
                        TRUE ~ metric)

paratext <- case_when(testtype == "p" ~ "Mean (Parametric)",
                      testtype == "np" ~ "Median (Non-Parametric)",
                      testtype == "r" ~ "Mean (Robust t-test)",
                      testtype == "bayes" ~ "Mean (Bayesian)",
                      )

touristdata_ANOVA <- touristdata_clean %>%
  filter(country %in% selected_countries) %>%
  mutate(country = fct_reorder(country, !!sym(metric), median, .desc = TRUE)) %>%
  drop_na()

if(nooutliers == T){
  touristdata_ANOVA <- touristdata_ANOVA %>%
      treat_outliers() 
}

touristdata_ANOVA %>% 
  ggbetweenstats(x = country, y = !!sym(metric),
                 xlab = "Country", ylab = metric_text,
                 type = testtype, pairwise.comparisons = compare, pairwise.display = pair, 
                 mean.ci = T, p.adjust.method = "fdr",  conf.level = conf,
                 title = paste0("Comparison of ",paratext," ",metric_text, " across Countries"),
                 package = "ggthemes", palette = "Tableau_10")

}
plot_ANOVA_country(metric = "cost_per_pax_night", selected_countries = top8, testtype = "np", pair = "ns", compare = F, conf = 0.95, nooutliers = T)

Majority of big spender indeed comes from Australia, followed by US, Canada, New Zealand, Denmark, and South Africa. However, it is noted that there is no statistically significant difference among many of them

plot_ANOVA_country(metric = "prop_night_spent_mainland", selected_countries = top8, testtype = "np", pair = "ns", conf = 0.95)

4.3.2 For Categorical Variables

plot_barstats_country <- function(metric = "purpose", selected_countries, testtype = "np",conf = 0.95) {
  
metric_text = case_when(metric == "age_group" ~ "Age Group",
                        metric == "travel_with" ~ "Travelling Companion",
                        metric == "purpose" ~ "Purpose",
                        metric == "main_activity" ~ "Main Activity",
                        metric == "info_source" ~ "Source of Information",
                        metric == "tour_arrangement" ~ "Tour Arrangement",
                        metric == "package_transport_int" ~ "Include International Transportation?",
                        metric == "package_accomodation" ~ "Include accomodation service?",
                        metric == "package_food" ~ "Include food service?",
                        metric == "package_transport_tz" ~ "Include domestic transport service?",
                        metric == "package_sightseeing" ~ "Include sightseeing service?",
                        metric == "package_guided_tour" ~ "Include guided tour?",
                        metric == "package_insurance" ~ "Insurance Included?",
                        metric == "payment_mode" ~ "Mode of Payment",
                        metric == "first_trip_tz" ~ "First trip to Tanzania?",
                        metric == "most_impressing" ~ "Most impressive about Tanzania?",
                        TRUE ~ metric)


touristdata_barstats <- touristdata_clean %>%
  filter(country %in% selected_countries) %>%
  drop_na()

touristdata_barstats %>% 
  ggbarstats(x = !!sym(metric), y = country,
             xlab = "Country", legend.title = metric_text,
             type = testtype, conf.level = conf,
             palette = "Set2")

}
plot_barstats_country(metric = "travel_with", selected_countries = top8, testtype = "np", conf = 0.95)

plot_barstats_country(metric = "purpose", selected_countries = top8, testtype = "np", conf = 0.95)

plot_barstats_country(metric = "package_insurance", selected_countries = top8, testtype = "np", conf = 0.95)

plot_barstats_country(metric = "first_trip_tz", selected_countries = top8, testtype = "np", conf = 0.95)

5. Exploring Spending vs Other Variables

5.1 For Numerical Variables

plot_scatter <- function(xaxis = "prop_night_spent_mainland", yaxis = "cost_per_pax_night", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), max_xaxis, testtype = "np", conf = 0.95, nooutliers = T) {
  
xaxis_text = case_when(xaxis == "total_female" ~ "Number of Female Visitors",
                       xaxis == "total_male" ~ "Number of Male Visitors",
                       xaxis == "total_tourist" ~ "Number of Visitors",
                       xaxis == "prop_night_spent_mainland" ~ "Proportion of Night spent in Mainland Tanzania",
                       xaxis == "total_night_spent" ~ "Total Night Spent",
                       TRUE ~ xaxis)

yaxis_text = case_when(yaxis == "total_cost" ~ "Total Spending (TZS)",
                       yaxis == "cost_per_pax" ~ "Individual Spending (TZS)",
                       yaxis == "cost_per_night" ~ "Spending per Night (TZS)",
                       yaxis == "cost_per_pax_night" ~ "Individual Spending per Night (TZS)",
                       TRUE ~ yaxis)

if(nooutliers == T){
  touristdata_clean <- touristdata_clean %>%
      treat_outliers() 
}

touristdata_clean %>%
  filter(region %in% regionlist,
         !!sym(xaxis) <= max_xaxis) %>%
  ggscatterstats(x = !!sym(xaxis), y = !!sym(yaxis),
                 type = testtype, conf.level = conf) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(x = xaxis_text, y = yaxis_text)
  
}
plot_scatter(xaxis = "total_tourist", yaxis = "total_cost", regionlist = c("Europe","Americas","Oceania"), max_xaxis = 100, testtype = "np", conf = 0.95, nooutliers = T)

5.2 For Categorical Variables

plot_ANOVA <- function(xaxis = "age_group", yaxis = "total_cost", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np", pair = "ns",conf = 0.95, nooutliers = T) {
  
xaxis_text = case_when(xaxis == "age_group" ~ "Age Group",
                        xaxis == "travel_with" ~ "Travelling Companion",
                        xaxis == "purpose" ~ "Purpose",
                        xaxis == "main_activity" ~ "Main Activity",
                        xaxis == "info_source" ~ "Source of Information",
                        xaxis == "tour_arrangement" ~ "Tour Arrangement",
                        xaxis == "package_transport_int" ~ "Include International Transportation?",
                        xaxis == "package_accomodation" ~ "Include accomodation service?",
                        xaxis == "package_food" ~ "Include food service?",
                        xaxis == "package_transport_tz" ~ "Include domestic transport service?",
                        xaxis == "package_sightseeing" ~ "Include sightseeing service?",
                        xaxis == "package_guided_tour" ~ "Include guided tour?",
                        xaxis == "package_insurance" ~ "Insurance Included?",
                        xaxis == "payment_mode" ~ "Mode of Payment",
                        xaxis == "first_trip_tz" ~ "First trip to Tanzania?",
                        xaxis == "most_impressing" ~ "Most impressive about Tanzania?",
                        TRUE ~ xaxis)

yaxis_text = case_when(yaxis == "total_cost" ~ "Total Spending (TZS)",
                       yaxis == "cost_per_pax" ~ "Individual Spending (TZS)",
                       yaxis == "cost_per_night" ~ "Spending per Night (TZS)",
                       yaxis == "cost_per_pax_night" ~ "Individual Spending per Night (TZS)",
                       TRUE ~ yaxis)


touristdata_ANOVA <- touristdata_clean %>%
  filter(region %in% regionlist) %>%
  drop_na()

if(nooutliers == T){
  touristdata_ANOVA <- touristdata_ANOVA %>%
      treat_outliers() 
}

touristdata_ANOVA %>% 
  ggbetweenstats(x = !!sym(xaxis), y = !!sym(yaxis),
                 xlab = xaxis_text, ylab = yaxis_text,
                 type = testtype, pairwise.comparisons = T, pairwise.display = pair, 
                 mean.ci = T, p.adjust.method = "fdr",  conf.level = conf,
                 package = "ggthemes", palette = "Tableau_10")

}
plot_ANOVA(xaxis = "travel_with", yaxis = "total_cost", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np", pair = "ns",conf = 0.95, nooutliers = T)

plot_ANOVA(xaxis = "age_group", yaxis = "total_cost", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np", pair = "ns",conf = 0.95)

plot_ANOVA(xaxis = "purpose", yaxis = "total_cost", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np", pair = "ns",conf = 0.95)

6. Exploring Other Variables

plot_barstats <- function(xaxis = "age_group", yaxis = "purpose", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np",conf = 0.95) {
  
xaxis_text = case_when(xaxis == "age_group" ~ "Age Group",
                        xaxis == "travel_with" ~ "Travelling Companion",
                        xaxis == "purpose" ~ "Purpose",
                        xaxis == "main_activity" ~ "Main Activity",
                        xaxis == "info_source" ~ "Source of Information",
                        xaxis == "tour_arrangement" ~ "Tour Arrangement",
                        xaxis == "package_transport_int" ~ "Include International Transportation?",
                        xaxis == "package_accomodation" ~ "Include accomodation service?",
                        xaxis == "package_food" ~ "Include food service?",
                        xaxis == "package_transport_tz" ~ "Include domestic transport service?",
                        xaxis == "package_sightseeing" ~ "Include sightseeing service?",
                        xaxis == "package_guided_tour" ~ "Include guided tour?",
                        xaxis == "package_insurance" ~ "Insurance Included?",
                        xaxis == "payment_mode" ~ "Mode of Payment",
                        xaxis == "first_trip_tz" ~ "First trip to Tanzania?",
                        xaxis == "most_impressing" ~ "Most impressive about Tanzania?",
                        TRUE ~ xaxis)

yaxis_text = case_when(yaxis == "age_group" ~ "Age Group",
                        yaxis == "travel_with" ~ "Travelling Companion",
                        yaxis == "purpose" ~ "Purpose",
                        yaxis == "main_activity" ~ "Main Activity",
                        yaxis == "info_source" ~ "Source of Information",
                        yaxis == "tour_arrangement" ~ "Tour Arrangement",
                        yaxis == "package_transport_int" ~ "Include International Transportation?",
                        yaxis == "package_accomodation" ~ "Include accomodation service?",
                        yaxis == "package_food" ~ "Include food service?",
                        yaxis == "package_transport_tz" ~ "Include domestic transport service?",
                        yaxis == "package_sightseeing" ~ "Include sightseeing service?",
                        yaxis == "package_guided_tour" ~ "Include guided tour?",
                        yaxis == "package_insurance" ~ "Insurance Included?",
                        yaxis == "payment_mode" ~ "Mode of Payment",
                        yaxis == "first_trip_tz" ~ "First trip to Tanzania?",
                        yaxis == "most_impressing" ~ "Most impressive about Tanzania?",
                        TRUE ~ yaxis)


touristdata_barstats <- touristdata_clean %>%
  filter(region %in% regionlist) %>%
  drop_na()

touristdata_barstats %>% 
  ggbarstats(x = !!sym(yaxis), y = !!sym(xaxis),
             xlab = xaxis_text, legend.title = yaxis_text,
             type = testtype, conf.level = conf,
             palette = "Set2")

}
plot_barstats(xaxis = "age_group", yaxis = "purpose", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np",conf = 0.95)

plot_barstats(xaxis = "main_activity", yaxis = "tour_arrangement", regionlist = c("Asia", "Africa", "Europe", "Oceania", "Americas"), testtype = "np",conf = 0.95)