Visualizing Survey Data with R

Zhaowen Guo

Agenda

  • Visualize descriptive patterns in the data

    • Bar plot and its extensions

    • Dot plot

  • Visualize fitted models, inferences, and final results

    • Frequentist estimation (survey package)

    • Bayesian estimation (rstanarm package)

Tidyverse approach

  • Organize your data in a tidy format
  • Load the tidyverse package

    • dplyr: data wrangling

    • ggplot2: data visualization

    • stringr: string manipulation

  • Ingredients of a ggplot2 plot

    • data

    • aesthetics

    • geom objects

An illustration

We will use the American Trends Panel Wave 99 (conducted Nov. 1-7, 2021) from Pew Research Center, which focuses on public attitudes on artificial intelligence (AI) and human enhancement.

Data preparation

Load packages

options(scipen = 999) # prevent scientific notation like e
library(foreign) # read sav files
library(survey) # survey data analysis with complex designs
library(rstanarm) # bayesian applied regression modeling
library(ggtext) # highlight text
library(tidyverse) # data wrangling and visualization

Load data

pew <- foreign::read.spss("ATP W99.sav", to.data.frame = T)
# alternative: pew <- haven::read_sav("ATP W99.sav")
  • When using foreign package, categorical variables are imported as factors

  • When using haven package, categorical variables are imported as labelled class variables (which are sometimes difficult to wrangle)

    • Sometimes we need to convert labelled variables into factors via haven::as_factor()

Specify survey design

pew_design <- svydesign(ids = ~0, # ~0 or ~1 indicates no clusters
                        weights = ~WEIGHT_W99, # calibration weights
                        data = pew)

Visualize descriptive patterns: bar plot

# estimates of proportions for facial recognition technology
q1 <- svymean(~FACEREC2_W99, 
              design = pew_design,
              na.rm = T)
q1
                                       mean     SE
FACEREC2_W99Good idea for society 0.4564492 0.0116
FACEREC2_W99Bad idea for society  0.2724515 0.0105
FACEREC2_W99Not sure              0.2679035 0.0100
FACEREC2_W99Refused               0.0031958 0.0013
# estimates of proportions for algorithms
q2 <- svymean(~SMALG2_W99, 
              design = pew_design,
              na.rm = T)
# estimates of proportions for driverless passenger vehicles
q3 <- svymean(~DCARS2_W99, 
              design = pew_design,
              na.rm = T)
# construct data for plotting
ai_society <- tibble(prop = round(c(q1[-4], q2[-4], q3[-4]), 2)*100,
                     type = rep(c("Facial recognition", "Algorithms", "Driverless vehicles"), each = 3),
                     response = rep(c("Good idea for society", "Bad idea for society", "Not sure"), times = 3)) 
# reorder variable levels
ai_society_reordered <- ai_society %>%
  mutate(type = factor(type, levels = c("Driverless vehicles", "Algorithms", "Facial recognition")),
         response = factor(response, levels = c("Not sure",
                                                "Bad idea for society",
                                                "Good idea for society")))
q1_data <- tibble(prop = round(q1[-4], 2) * 100,
                  cat = c("Good idea for society", "Bad idea for society", "Not sure"))

ggplot(q1_data, aes(x = cat, y = prop)) +
  geom_bar(stat = "identity", width = 0.5) + 
  geom_text(aes(label = paste0(prop, "%")), hjust = 0.5, vjust = -0.3) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_bw() + 
  labs(x = "", y = "")

ai_society_reordered %>% # alternative: |>
  ggplot(aes(x = type, y = prop, fill = response)) +
  geom_bar(position = "stack",
           stat = "identity", # use actual values in y 
           width = 0.4) +
  geom_text(aes(label = prop, color = response), 
            position = position_stack(),
            hjust = 1.1) + # hjust = 0.5 centers at the x position, hjust < 0.5 moves right, hjust > 0.5 moves left
  coord_flip() + # flip the x and y coordinates (vertical bars become horizontal bars)
  scale_fill_manual(values = c("Good idea for society" = "#2f6ead", 
                               "Bad idea for society" = "#95a644", 
                               "Not sure" = "#d6d6d6"), 
                    name = "") + # https://imagecolorpicker.com/en to pick colors
  scale_color_manual(values = c("Good idea for society" = "white", 
                                "Bad idea for society"= "white", 
                                "Not sure"= "black")) +
  labs(y = "", 
       x = "", 
       title = str_wrap("The public is more open to applications of human enhancement and AI", 80),
       subtitle = str_wrap("% of US adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a...", 100)) +
  guides(fill = guide_legend(reverse = T), color = "none") + # reverse the legend order to match the color order in each bar
  theme(legend.position = "top",
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        plot.title = element_text(size = 13, face = "bold", hjust = 0),
        plot.subtitle = element_text(size = 10, color = "grey", hjust = 0, face = "italic"))

ggplot(ai_society_reordered, aes(x = type, y = prop, fill = response)) + 
  geom_bar(position = "stack",
           stat = "identity",
           width = 0.6) +
  geom_text(aes(label = prop, color = response), 
            position = position_stack(),
            hjust = 1.1) +
  geom_vline(xintercept = c(1.5, 2.5), linetype = "dashed") + # add dashed lines separating questions
  coord_flip() +
  scale_fill_manual(values = c("Good idea for society" = "#2f6ead", 
                               "Bad idea for society" = "#95a644", 
                               "Not sure" = "#d6d6d6"), 
                    name = "") +
  scale_color_manual(values = c("Good idea for society" = "white", 
                                "Bad idea for society"= "white", 
                                "Not sure"= "black")) +
  facet_wrap(~fct_rev(response)) + # split into three multiples based on response option
  labs(y = "", 
       x = "", 
       title = str_wrap("The public is more open to applications of human enhancement and AI", 80),
       subtitle = str_wrap("% of US adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a...", 100)) +
  theme(legend.position = "none",
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(face = "bold", hjust = 0),
        plot.title = element_text(size = 13, face = "bold", hjust = 0),
        plot.subtitle = element_text(size = 10, color = "grey", hjust = 0, face = "italic"))

Visualize descriptive patterns: bar plot

q4 <- svymean(~FACEREC7_W99, 
              design = pew_design,
              na.rm = T)
q4
                                                                     mean
FACEREC7_W99Government will go too far regulating its use        0.471111
FACEREC7_W99Government will not go far enough regulating its use 0.509852
FACEREC7_W99Refused                                              0.019037
                                                                     SE
FACEREC7_W99Government will go too far regulating its use        0.0116
FACEREC7_W99Government will not go far enough regulating its use 0.0116
FACEREC7_W99Refused                                              0.0030
q4.rep <- svymean(~FACEREC7_W99, 
                  subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
                  na.rm = T)
q4.rep
                                                                     mean
FACEREC7_W99Government will go too far regulating its use        0.586876
FACEREC7_W99Government will not go far enough regulating its use 0.396366
FACEREC7_W99Refused                                              0.016757
                                                                     SE
FACEREC7_W99Government will go too far regulating its use        0.0167
FACEREC7_W99Government will not go far enough regulating its use 0.0167
FACEREC7_W99Refused                                              0.0034
q4.dem <- svymean(~FACEREC7_W99, 
                  subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
                  na.rm = T)
q4.dem
                                                                     mean
FACEREC7_W99Government will go too far regulating its use        0.364099
FACEREC7_W99Government will not go far enough regulating its use 0.619109
FACEREC7_W99Refused                                              0.016792
                                                                     SE
FACEREC7_W99Government will go too far regulating its use        0.0159
FACEREC7_W99Government will not go far enough regulating its use 0.0160
FACEREC7_W99Refused                                              0.0046
fr_party <- tibble(type = c("U.S. adults", "Rep/lean Rep", "Dem/lean Dem"),
                   go_too_far = round(c(q4[1], q4.rep[1], q4.dem[1]), 2)*100,
                   not_far_enough = round(c(q4[2], q4.rep[2], q4.dem[2]), 2)*100)

# have values for one group as negatives
fr_party_long <- fr_party %>%
  pivot_longer(cols = -type, names_to = "response", values_to = "prop") %>%
  mutate(prop = ifelse(response == "not_far_enough", -1 * prop, prop)) %>%
  mutate(response = factor(response, levels = c("not_far_enough", "go_too_far"))) %>%
  mutate(type = factor(type, levels = c("Dem/lean Dem", "Rep/lean Rep", "U.S. adults")))
ggplot(fr_party_long, aes(x = type, y = prop, fill = response)) +
  geom_col(position = "identity",
           width = 0.6) +
  theme(aspect.ratio = 0.5) + # adjust the ratio of height to width to make bars thinner without increasing their distance
  coord_flip() +  
  labs(x = "", 
       y = "",
       title = str_wrap("Similar shares say government will go too far or not far enough regulating police's facial recognition use, but this varies by party", 85),
       subtitle = str_wrap("% of U.S. adults who say that if the use of facial recognition technology by police becomes widespread, their greater concern is that government will __ regulating its use", 105),
       caption = "Note: Respondents who did not give an answer are not shown.\nSource: Survey conducted Nov.1-7, 2021.") +
  geom_text(data = fr_party_long[fr_party_long$prop<0, ],
            aes(label = prop*-1), position = position_stack(vjust = -0.1)) +
  geom_text(data = fr_party_long[fr_party_long$prop>0, ],
            aes(label = prop), position = position_stack(vjust = 1.1)) +
  scale_fill_manual(values = c("not_far_enough" = "#95a644", "go_too_far" = "#2f6ead"),
                    labels = c("Not go far enough", "Go too far"),
                    name = "") +
  guides(fill = guide_legend(keywidth = unit(0, "mm"),  # remove legend key squares
                             keyheight = unit(0, "mm"),
                             direction = "horizontal")) +
  theme(legend.position = c(0.3, 1), # x, y are between 0 and 1
        legend.spacing.x = unit(15, "mm"),
        legend.text = element_text(face = "bold", size = 12),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        plot.title.position = "plot", # move title to the leftmost position
        plot.caption.position = "plot", # move subtitle to the leftmost position
        plot.subtitle = element_text(size = 10, color = "grey60", hjust = 0, face = "italic", margin = margin(b=20)),
        plot.caption = element_text(size = 10, color = "grey60", hjust = 0), # add graph notes
        plot.margin = margin(b=10,t=10,l=10,r=10))

Visualize descriptive patterns: dot plot

# news and information
q5.dem <- svymean(~I(SMALG4_a_W99 == "Definitely happening" | SMALG4_a_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
                  na.rm = T)
q5.rep <- svymean(~I(SMALG4_a_W99 == "Definitely happening" | SMALG4_a_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
                  na.rm = T)

# political viewpoints
q6.dem <- svymean(~I(SMALG4_b_W99 == "Definitely happening" | SMALG4_b_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
                  na.rm = T)
q6.rep <- svymean(~I(SMALG4_b_W99 == "Definitely happening" | SMALG4_b_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
                  na.rm = T)

# trustworthy information
q7.dem <- svymean(~I(SMALG4_c_W99 == "Definitely happening" | SMALG4_c_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
                  na.rm = T)
q7.rep <- svymean(~I(SMALG4_c_W99 == "Definitely happening" | SMALG4_c_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
                  na.rm = T)

# meaningful conversations
q8.dem <- svymean(~I(SMALG4_d_W99 == "Definitely happening" | SMALG4_d_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
                  na.rm = T)
q8.rep <- svymean(~I(SMALG4_d_W99 == "Definitely happening" | SMALG4_d_W99 == "Probably happening"),
                  subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
                  na.rm = T)
companies <- tibble(issue = c("Political viewpoints are\n being censored", 
                              "News and information\n are being wrongly removed", 
                              "It is getting easier to\n find trustworthy\n information", 
                              "It is allowing people to\n have more meaningful\n conversations"),
                    dem = round(c(q6.dem[2], q5.dem[2], q7.dem[2], q8.dem[2]), 2) * 100,
                    rep = round(c(q6.rep[2], q5.rep[2], q7.rep[2], q8.rep[2]), 2) * 100,
                    diff = ifelse(rep - dem > 0, 
                                  paste0("+", as.character(rep-dem)), 
                                  as.character(rep-dem)))
subtitle <- "% of U.S. adults who say that due to widespread use of computer programs by social media companies to find <br> false information, each of the following is <b>definitely</b> or <b>probably happening</b> on the companies' sites"

ggplot(companies, aes(y = issue)) +
  geom_segment(aes(x = dem, xend = rep, yend = issue), linetype = "solid", linewidth = 2, color = 'grey90') +
  geom_point(aes(x = dem), color = 'royalblue4', size = 3) +  # Blue dots for Dem/lean Dem
  geom_point(aes(x = rep), color = 'tomato4', size = 3) +   # Red dots for Rep/lean Rep
  geom_text(aes(x = dem, label = dem), color = 'royalblue4', vjust =2) +
  geom_text(aes(x = rep, label = rep), color = 'tomato4', vjust =2) +
  geom_rect(aes(xmin=93, xmax = 103, ymin = 0.5, ymax = 5),
            fill = "grey90", 
            alpha = 0.2) +
  geom_text(aes(x = 96, label = diff), hjust = 0) +
  labs(x = "", 
       y = "",
       title = str_wrap("Majorities in both parties say social media companies using algorithms to find false information is leading to censorship, but Republicans far more likely to say so", 95),
       subtitle = subtitle) +
  annotate(geom = "text", label = str_wrap("Dem/lean Dem", 1), color = 'royalblue4', x = 58, y = 4.7) +
  annotate(geom = "text", label = str_wrap("Rep/lean Rep", 1), color = 'tomato4', x = 86, y = 4.7) +
  annotate(geom = "text", label = str_wrap("DIFF Rep-Dem", 2), x = 98, y = 4.7) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        plot.subtitle = element_markdown(size = 10, margin = margin(b=20), color = "grey60", face = "italic"),
        plot.title.position = "plot")

censored_views <- tibble(party = c("dem", "rep"),
                         est = c(q6.dem[2], q6.rep[2])*100,
                         lower = c(confint(q6.dem)[2,1], confint(q6.rep)[2,1])*100,
                         upper = c(confint(q6.dem)[2,2], confint(q6.rep)[2,2])*100
)

ggplot(censored_views, aes(x = party, y = est, ymin = lower, ymax = upper)) +
  geom_pointrange() + # we can also use geom_errorbar()
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  scale_x_discrete(labels = c("Dem/lean Dem", "Rep/lean Rep")) +
  theme_bw() +
  labs(x = "", y = "", title = "% of U.S. adults who believe the use of algorithms to detect false information\n leads to censorship")

Visualize models and results: survey package

Y: Whether an individual believes that social media companies’ use of algorithms to detect false information leads to censorship

X: Confidence in social media companies

# recode variables
pew_updated <- pew %>%
  mutate(conf_company = case_when(SMALG7_W99 == "No confidence at all" ~ 1,
                                  SMALG7_W99 == "Not too much confidence" ~ 2,
                                  SMALG7_W99 == "A fair amount of confidence" ~ 3,
                                  SMALG7_W99 == "A great deal of confidence" ~ 4),
         censored_view = case_when(SMALG4_b_W99 == "Probably NOT happening" | SMALG4_b_W99 == "Definitely NOT happening" ~ 0, 
                                   SMALG4_b_W99 == "Probably happening" | SMALG4_b_W99 == "Definitely happening" ~ 1))

# update survey design with added variables
pew_design <- update(pew_design, conf_company = pew_updated$conf_company)
pew_design <- update(pew_design, censored_view = pew_updated$censored_view)

# define a logistic regression model
mod1 <- svyglm(censored_view ~ conf_company, family = quasibinomial(link = "logit"), design = pew_design)
summary(mod1)

Call:
svyglm(formula = censored_view ~ conf_company, design = pew_design, 
    family = quasibinomial(link = "logit"))

Survey design:
update(pew_design, censored_view = pew_updated$censored_view)

Coefficients:
             Estimate Std. Error t value             Pr(>|t|)    
(Intercept)   1.39690    0.14292   9.774 < 0.0000000000000002 ***
conf_company -0.23701    0.06647  -3.566             0.000366 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasibinomial family taken to be 0.9888884)

Number of Fisher Scoring iterations: 4
# compute predicted probabilities
new_data <- tibble(conf_company = seq(1, 4, length.out = 100))
preds <- predict(mod1, type = "response", newdata = new_data, se.fit = T) %>% as.tibble() 
new_data <- new_data %>%
  mutate(probs = preds$response,
         lower = preds$response - 1.96 * preds$SE,
         upper = preds$response + 1.96 * preds$SE)

Visualize models and results: survey package

ggplot() +
  # plot raw data
  geom_jitter(data = pew_updated, 
              aes(x = conf_company, y = censored_view), 
              size = 1/10, width = 0.3 , height = 0.1, alpha=0.5) +
  # plot fitted model
  geom_line(data=new_data, aes(x = conf_company, y = probs), color = "blue", size = 1) +
  # plot confidence interval bands
  geom_ribbon(data=new_data, aes(x = conf_company, ymin = lower, ymax = upper), alpha = 0.2) +
  scale_y_continuous(breaks = c(0, 0.5, 1)) +
  labs(x = "Confidence in social media companies", y = "Probability of perceived censorship") +
  theme_bw()

Visualize models and results: rstanarm package

mod2 <- stan_glm(censored_view ~ conf_company, family = binomial(link = "logit"), data = pew_updated)
new_data <- tibble(conf_company = seq(1, 4, length.out=100))
preds2 <- posterior_epred(mod2, newdata = new_data)
# Compute credible intervals
ci_lower <- apply(preds2, 2, quantile, probs = 0.025)
ci_upper <- apply(preds2, 2, quantile, probs = 0.975)
new_data <- new_data %>%
  mutate(lower = ci_lower, upper = ci_upper, probs = (ci_lower+ci_upper)/2)

Visualize models and results: rstanarm package

ggplot() +
  geom_jitter(data=pew_updated, aes(x = conf_company, y = censored_view), size=1/10, width=0.3 , height = 0.1,alpha=0.5) +
  geom_line(data=new_data, aes(x = conf_company, y = probs), color = "blue", size = 1) +
  geom_ribbon(data=new_data, aes(x = conf_company, ymin = lower, ymax = upper), alpha = 0.2) +
  scale_y_continuous(breaks = c(0, 0.5, 1)) +
  labs(x = "Confidence in social media companies", y = "Probability of perceived censorship") +
  theme_bw()