Visualizing Survey Data with R

Zhaowen Guo

Agenda

  • Visualize descriptive patterns in the data

    • Bar plot and its extensions
  • Visualize fitted models, inferences, and final results

    • Frequentist estimation (survey package)

    • Bayesian estimation (rstanarm package)

Tidyverse approach

  • Organize your data in a tidy format
  • Load the tidyverse package

    • dplyr: data wrangling

    • ggplot2: data visualization

    • stringr: string manipulation

  • Ingredients of a ggplot2 plot

    • data: tidy dataset

    • aesthetics: how variables map to visual properties (x, y, color, etc.)

    • geom objects: type of plot you’re making (geom_bar(), geom_point(), etc.)

An illustration

We will use the American Trends Panel Wave 99 (conducted Nov. 1-7, 2021) from Pew Research Center, which focuses on public attitudes on artificial intelligence (AI) and human enhancement.

Data preparation

Load packages

options(scipen = 999) # prevent scientific notation like e
library(foreign) # read sav files
library(survey) # survey data analysis with complex designs
library(rstanarm) # bayesian applied regression modeling
library(ggtext) # highlight text
library(tidyverse) # data wrangling and visualization

Load data

pew <- read.spss("ATP W99.sav", to.data.frame = T)
# alternative: pew <- haven::read_sav("ATP W99.sav")
  • When using foreign package, categorical variables are imported as factors

  • When using haven package, categorical variables are imported as labelled class variables (which are sometimes difficult to wrangle)

    • Sometimes we need to convert labelled variables into factors via haven::as_factor()

Data Preparation

Inspect data

  • How many variables and observations?
  • What types of variables do you have?
  • Are categorical variables properly formatted?
  • Are there missing values that might affect results?
  • Do you know where the weights and key identifiers are?
# check data structure 
str(pew) 
# check missing values 
colSums(is.na(pew)) %>% sort(decreasing = TRUE) 
# inspect survey weights 
summary(pew$fWEIGHT_W99)

Visualize descriptive patterns: bar plot

table(pew$FACEREC2_W99)

Good idea for society  Bad idea for society              Not sure 
                 2404                  1373                  1362 
              Refused 
                   14 
pew_clean <- pew %>%
  filter(!is.na(FACEREC2_W99), FACEREC2_W99 != "Refused") %>%
  droplevels()

prop.table(table(pew_clean$FACEREC2_W99))

Good idea for society  Bad idea for society              Not sure 
            0.4677953             0.2671726             0.2650321 
ggplot(pew_clean, aes(x = FACEREC2_W99)) +
  geom_bar(fill = "grey60") +
  labs(
    title = "Public Opinion on Police Use of Facial Recognition Technology",
    subtitle = "Pew American Trends Panel, Wave 99 (Nov. 2021)",
    x = "",
    y = "Number of Respondents"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))

pew_clean_prop <- pew_clean %>%
  count(FACEREC2_W99) %>%
  mutate(prop = n / sum(n))

ggplot(pew_clean_prop, aes(x = FACEREC2_W99, y = prop)) +
  geom_col(fill = "grey60") +
  scale_y_continuous(labels = scales::percent) +
  geom_text(aes(label = scales::percent(prop, accuracy = 1)),
            vjust = -0.3, size = 3) +
  labs(
    title = "Public Opinion on Police Use of Facial Recognition Technology",
    subtitle = "Pew American Trends Panel, Wave 99 (Nov. 2021)",
    x = "",
    y = "Proportion of Respondents"
  ) +
  theme_minimal()

table(pew$F_PARTY_FINAL)

    Republican       Democrat    Independent Something else        Refused 
          3174           3331           2717            937            101 
pew_clean <- pew_clean %>%
  filter(!F_PARTY_FINAL %in% c("Something else", "Refused")) %>%
  droplevels()

party_colors <- c(
  "Democrat" = "#4F81BD",     
  "Republican" = "#C0504D",   
  "Independent" = "#9BBB59" 
)

ggplot(pew_clean, aes(x = FACEREC2_W99, fill = F_PARTY_FINAL)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = party_colors) +
  labs(
    title = "Public Opinion on Police Use of Facial Recognition Technology by Partisanship",
    x = "",
    y = "Number of Respondents",
    fill = "Partisanship"
  ) +
  theme_minimal()

table(pew$F_GENDER)

            A man           A woman In some other way           Refused 
             4549              5628                60                23 
pew_clean <- pew_clean %>%
  filter(!F_GENDER %in% c("In some other way", "Refused")) %>%
  droplevels() %>%
  mutate(F_GENDER = forcats::fct_recode(F_GENDER,
    "Female" = "A woman",
    "Male" = "A man"
  ))

ggplot(pew_clean, aes(x = FACEREC2_W99)) +
  geom_bar(fill = "grey60", width = 0.5) +
  facet_wrap(~ F_GENDER, ncol = 1) +  
  labs(
    title = "Public Opinion on Police Use of Facial Recognition Technology by Gender",
    x = "",
    y = "Number of Respondents"
  ) +
  theme_minimal()

Visualize descriptive patterns: advanced bar plot

# estimate proportions for three AI-related questions using survey design

pew_design <- svydesign(ids = ~0, # ~0 or ~1 indicates no clusters
                        weights = ~WEIGHT_W99, # calibration weights
                        data = pew)

q1 <- svymean(~FACEREC2_W99, # facial recognition
              design = pew_design,
              na.rm = T)
q2 <- svymean(~SMALG2_W99, # algorithms
              design = pew_design,
              na.rm = T)
q3 <- svymean(~DCARS2_W99, # driverless passenger vehicles
              design = pew_design,
              na.rm = T)
# combine and label proportion estimates for plotting

ai_society <- tibble(
  prop = round(c(q1[-4], q2[-4], q3[-4]) * 100, 0),
  type = rep(c("Facial recognition", "Algorithms", "Driverless vehicles"), each = 3),
  response = rep(c("Good idea for society", "Bad idea for society", "Not sure"), times = 3)
) %>%
  mutate(
    type = factor(type, levels = c("Driverless vehicles", "Algorithms", "Facial recognition")),
    response = factor(response, levels = c("Not sure", "Bad idea for society", "Good idea for society"))
  )
ai_society %>%
  ggplot(aes(x = type, y = prop, fill = response)) +
  geom_col(width = 0.4) +

  # add percentage labels inside bars
  geom_text(
    aes(label = prop, color = response),
    position = position_stack(vjust = 0.5),
    size = 3.5
  ) +

  # flip to horizontal bars
  coord_flip() +

  # custom fill and text color
  scale_fill_manual(
    values = c(
      "Good idea for society" = "#2f6ead",
      "Bad idea for society" = "#95a644",
      "Not sure" = "#d6d6d6"
    ),
    name = ""
  ) +
  scale_color_manual(
    values = c(
      "Good idea for society" = "white",
      "Bad idea for society" = "white",
      "Not sure" = "black"
    )
  ) +

  # titles and labels
  labs(
    title = str_wrap("The public is more open to applications of human enhancement and AI", 80),
    subtitle = str_wrap("% of U.S. adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a...", 100),
    x = NULL,
    y = NULL
  ) +

  # reverse legend order to match stacking order
  guides(fill = guide_legend(reverse = TRUE), color = "none") +

  # clean minimal theme
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "top",
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    plot.title = element_text(size = 13, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 10, color = "grey40", hjust = 0, face = "italic")
  )

ggplot(ai_society, aes(x = type, y = prop, fill = response)) +
  geom_col(width = 0.4) +
  coord_flip() +

  # dashed lines below each category label
  geom_vline(xintercept = c(1.5, 2.5), linetype = "dashed", color = "grey60", linewidth = 0.4) +

  # label inside bars, right aligned
  geom_text(
    aes(label = prop, color = response),
    position = position_stack(vjust = 0.5),
    hjust = 1.05,
    size = 4
  ) +

  # facet by response to mimic Pew's column layout
  facet_wrap(~ fct_rev(response), nrow = 1) +

  # custom fill and label colors (matching Pew palette)
  scale_fill_manual(
    values = c(
      "Good idea for society" = "#2f6ead",     
      "Bad idea for society"  = "#95a644",     
      "Not sure"              = "#d6d6d6"      
    ),
    name = NULL
  ) +
  scale_color_manual(
    values = c(
      "Good idea for society" = "white",
      "Bad idea for society"  = "white",
      "Not sure"              = "black"
    )
  ) +

  # axis and title formatting
  labs(
    title = str_wrap("Majority says brain chip implants for improved cognitive abilities would be bad idea for society; public more open to other applications of human enhancement and AI", 100), 
    subtitle = str_wrap("% of U.S. adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a ...", 120),
    x = NULL,
    y = NULL
  ) +

  theme_minimal(base_size = 12) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none",
    strip.background = element_blank(),
    strip.text = element_text(face = "bold", size = 11, hjust = 0.5),
    plot.title = element_text(size = 13, face = "bold", hjust = 0),
    plot.subtitle = element_text(size = 10, color = "grey30", hjust = 0, face = "italic")
  )

Visualize models and results: survey package

Y: Whether an individual believes that social media companies’ use of algorithms to detect false information leads to censorship

X: Confidence in social media companies

# recode variables
pew_updated <- pew %>%
  mutate(
    conf_company = case_when(
      SMALG7_W99 == "No confidence at all" ~ 1,
      SMALG7_W99 == "Not too much confidence" ~ 2,
      SMALG7_W99 == "A fair amount of confidence" ~ 3,
      SMALG7_W99 == "A great deal of confidence" ~ 4
    ),
    censored_view = case_when(
      SMALG4_b_W99 %in% c("Probably NOT happening", "Definitely NOT happening") ~ 0,
      SMALG4_b_W99 %in% c("Probably happening", "Definitely happening") ~ 1
    )
  )

# update survey design with added variables
pew_design <- update(pew_design, 
                     conf_company = pew_updated$conf_company,
                     censored_view = pew_updated$censored_view)

# define a logistic regression model
mod1 <- svyglm(censored_view ~ conf_company, 
               family = quasibinomial(link = "logit"), 
               design = pew_design)
summary(mod1)

Call:
svyglm(formula = censored_view ~ conf_company, design = pew_design, 
    family = quasibinomial(link = "logit"))

Survey design:
update(pew_design, conf_company = pew_updated$conf_company, censored_view = pew_updated$censored_view)

Coefficients:
             Estimate Std. Error t value             Pr(>|t|)    
(Intercept)   1.39690    0.14292   9.774 < 0.0000000000000002 ***
conf_company -0.23701    0.06647  -3.566             0.000366 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasibinomial family taken to be 0.9888884)

Number of Fisher Scoring iterations: 4
# compute predicted probabilities
new_data <- tibble(conf_company = seq(1, 4, length.out = 100))
preds <- predict(mod1, type = "response", newdata = new_data, se.fit = TRUE)

# combine predictions with confidence intervals
new_data <- new_data %>%
  mutate(
    probs = as.numeric(preds),
    se = sqrt(attr(preds, "var")),
    lower = pmax(0, probs - 1.96 * se),
    upper = pmin(1, probs + 1.96 * se)
  )

Visualize models and results: survey package

ggplot() +
  # jittered raw data points
   geom_jitter(
    data = pew_updated, 
    aes(x = conf_company, y = censored_view),
    width = 0.3, height = 0.1, size = 0.1, alpha = 0.5
  ) +
  
  # predicted probability line
  geom_line(
    data = new_data, 
    aes(x = conf_company, y = probs), 
    color = "#2f6ead", size = 1
  ) +
  
  # confidence interval bands
  geom_ribbon(
    data = new_data, 
    aes(x = conf_company, ymin = lower, ymax = upper),
    fill = "#2f6ead", alpha = 0.2
  ) +
  
  # y-axis formatting
  scale_y_continuous(
    breaks = c(0, 0.5, 1), 
    limits = c(0, 1), 
    labels = scales::percent_format(accuracy = 1)
  ) +
  
  labs(
    x = "Confidence in Social Media Companies",
    y = "Probability of Believing Platforms Censor Views"
  ) +
  theme_bw(base_size = 12)

Visualize models and results: rstanarm package

# fit Bayesian logistic regression using rstanarm
mod2 <- stan_glm(
  censored_view ~ conf_company,
  family = binomial(link = "logit"),
  data = pew_updated
)

SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 0.000384 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.84 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.184 seconds (Warm-up)
Chain 1:                1.479 seconds (Sampling)
Chain 1:                2.663 seconds (Total)
Chain 1: 

SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
Chain 2: 
Chain 2: Gradient evaluation took 0.000344 seconds
Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 3.44 seconds.
Chain 2: Adjust your expectations accordingly!
Chain 2: 
Chain 2: 
Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 2: 
Chain 2:  Elapsed Time: 1.379 seconds (Warm-up)
Chain 2:                1.495 seconds (Sampling)
Chain 2:                2.874 seconds (Total)
Chain 2: 

SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
Chain 3: 
Chain 3: Gradient evaluation took 0.000187 seconds
Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 1.87 seconds.
Chain 3: Adjust your expectations accordingly!
Chain 3: 
Chain 3: 
Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 3: 
Chain 3:  Elapsed Time: 1.177 seconds (Warm-up)
Chain 3:                1.442 seconds (Sampling)
Chain 3:                2.619 seconds (Total)
Chain 3: 

SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
Chain 4: 
Chain 4: Gradient evaluation took 0.000179 seconds
Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 1.79 seconds.
Chain 4: Adjust your expectations accordingly!
Chain 4: 
Chain 4: 
Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 4: 
Chain 4:  Elapsed Time: 1.155 seconds (Warm-up)
Chain 4:                1.442 seconds (Sampling)
Chain 4:                2.597 seconds (Total)
Chain 4: 
# create a new dataset for prediction across the full range of conf_company
new_data <- tibble(conf_company = seq(1, 4, length.out = 100))

# get posterior predicted probabilities
preds2 <- posterior_epred(mod2, newdata = new_data)

# compute 95% credible intervals for each predicted value
new_data <- new_data %>%
  mutate(
    lower = apply(preds2, 2, quantile, probs = 0.025),
    upper = apply(preds2, 2, quantile, probs = 0.975),
    probs = colMeans(preds2)  # posterior mean prediction
  )

Visualize models and results: rstanarm package

ggplot() +
  geom_jitter(
    data = pew_updated,
    aes(x = conf_company, y = censored_view),
    width = 0.3, height = 0.1, size = 0.1, alpha = 0.5
  ) +

  geom_line(
    data = new_data,
    aes(x = conf_company, y = probs),
    color = "#2f6ead", size = 1
  ) +
  
  geom_ribbon(
    data = new_data,
    aes(x = conf_company, ymin = lower, ymax = upper),
    fill = "#2f6ead", alpha = 0.2
  ) +
  
  scale_y_continuous(
    breaks = c(0, 0.5, 1),
    labels = scales::percent_format(accuracy = 1),
    limits = c(0, 1)
  ) +
  
  labs(
    x = "Confidence in Social Media Companies",
    y = "Probability of Believing Platforms Censor Views"
  ) +
  
  theme_bw(base_size = 12)