Visualize descriptive patterns in the data
Visualize fitted models, inferences, and final results
Frequentist estimation (survey package)
Bayesian estimation (rstanarm package)
Load the tidyverse package
dplyr: data wrangling
ggplot2: data visualization
stringr: string manipulation
Ingredients of a ggplot2 plot
data: tidy dataset
aesthetics: how variables map to visual properties (x
, y
, color
, etc.)
geom objects: type of plot you’re making (geom_bar()
, geom_point()
, etc.)
We will use the American Trends Panel Wave 99 (conducted Nov. 1-7, 2021) from Pew Research Center, which focuses on public attitudes on artificial intelligence (AI) and human enhancement.
Load packages
Load data
When using foreign package, categorical variables are imported as factors
When using haven package, categorical variables are imported as labelled class variables (which are sometimes difficult to wrangle)
haven::as_factor()
Inspect data
Good idea for society Bad idea for society Not sure
2404 1373 1362
Refused
14
pew_clean <- pew %>%
filter(!is.na(FACEREC2_W99), FACEREC2_W99 != "Refused") %>%
droplevels()
prop.table(table(pew_clean$FACEREC2_W99))
Good idea for society Bad idea for society Not sure
0.4677953 0.2671726 0.2650321
ggplot(pew_clean, aes(x = FACEREC2_W99)) +
geom_bar(fill = "grey60") +
labs(
title = "Public Opinion on Police Use of Facial Recognition Technology",
subtitle = "Pew American Trends Panel, Wave 99 (Nov. 2021)",
x = "",
y = "Number of Respondents"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
pew_clean_prop <- pew_clean %>%
count(FACEREC2_W99) %>%
mutate(prop = n / sum(n))
ggplot(pew_clean_prop, aes(x = FACEREC2_W99, y = prop)) +
geom_col(fill = "grey60") +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = scales::percent(prop, accuracy = 1)),
vjust = -0.3, size = 3) +
labs(
title = "Public Opinion on Police Use of Facial Recognition Technology",
subtitle = "Pew American Trends Panel, Wave 99 (Nov. 2021)",
x = "",
y = "Proportion of Respondents"
) +
theme_minimal()
Republican Democrat Independent Something else Refused
3174 3331 2717 937 101
pew_clean <- pew_clean %>%
filter(!F_PARTY_FINAL %in% c("Something else", "Refused")) %>%
droplevels()
party_colors <- c(
"Democrat" = "#4F81BD",
"Republican" = "#C0504D",
"Independent" = "#9BBB59"
)
ggplot(pew_clean, aes(x = FACEREC2_W99, fill = F_PARTY_FINAL)) +
geom_bar(position = "dodge") +
scale_fill_manual(values = party_colors) +
labs(
title = "Public Opinion on Police Use of Facial Recognition Technology by Partisanship",
x = "",
y = "Number of Respondents",
fill = "Partisanship"
) +
theme_minimal()
A man A woman In some other way Refused
4549 5628 60 23
pew_clean <- pew_clean %>%
filter(!F_GENDER %in% c("In some other way", "Refused")) %>%
droplevels() %>%
mutate(F_GENDER = forcats::fct_recode(F_GENDER,
"Female" = "A woman",
"Male" = "A man"
))
ggplot(pew_clean, aes(x = FACEREC2_W99)) +
geom_bar(fill = "grey60", width = 0.5) +
facet_wrap(~ F_GENDER, ncol = 1) +
labs(
title = "Public Opinion on Police Use of Facial Recognition Technology by Gender",
x = "",
y = "Number of Respondents"
) +
theme_minimal()
# estimate proportions for three AI-related questions using survey design
pew_design <- svydesign(ids = ~0, # ~0 or ~1 indicates no clusters
weights = ~WEIGHT_W99, # calibration weights
data = pew)
q1 <- svymean(~FACEREC2_W99, # facial recognition
design = pew_design,
na.rm = T)
q2 <- svymean(~SMALG2_W99, # algorithms
design = pew_design,
na.rm = T)
q3 <- svymean(~DCARS2_W99, # driverless passenger vehicles
design = pew_design,
na.rm = T)
# combine and label proportion estimates for plotting
ai_society <- tibble(
prop = round(c(q1[-4], q2[-4], q3[-4]) * 100, 0),
type = rep(c("Facial recognition", "Algorithms", "Driverless vehicles"), each = 3),
response = rep(c("Good idea for society", "Bad idea for society", "Not sure"), times = 3)
) %>%
mutate(
type = factor(type, levels = c("Driverless vehicles", "Algorithms", "Facial recognition")),
response = factor(response, levels = c("Not sure", "Bad idea for society", "Good idea for society"))
)
ai_society %>%
ggplot(aes(x = type, y = prop, fill = response)) +
geom_col(width = 0.4) +
# add percentage labels inside bars
geom_text(
aes(label = prop, color = response),
position = position_stack(vjust = 0.5),
size = 3.5
) +
# flip to horizontal bars
coord_flip() +
# custom fill and text color
scale_fill_manual(
values = c(
"Good idea for society" = "#2f6ead",
"Bad idea for society" = "#95a644",
"Not sure" = "#d6d6d6"
),
name = ""
) +
scale_color_manual(
values = c(
"Good idea for society" = "white",
"Bad idea for society" = "white",
"Not sure" = "black"
)
) +
# titles and labels
labs(
title = str_wrap("The public is more open to applications of human enhancement and AI", 80),
subtitle = str_wrap("% of U.S. adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a...", 100),
x = NULL,
y = NULL
) +
# reverse legend order to match stacking order
guides(fill = guide_legend(reverse = TRUE), color = "none") +
# clean minimal theme
theme_minimal(base_size = 11) +
theme(
legend.position = "top",
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
plot.title = element_text(size = 13, face = "bold", hjust = 0),
plot.subtitle = element_text(size = 10, color = "grey40", hjust = 0, face = "italic")
)
ggplot(ai_society, aes(x = type, y = prop, fill = response)) +
geom_col(width = 0.4) +
coord_flip() +
# dashed lines below each category label
geom_vline(xintercept = c(1.5, 2.5), linetype = "dashed", color = "grey60", linewidth = 0.4) +
# label inside bars, right aligned
geom_text(
aes(label = prop, color = response),
position = position_stack(vjust = 0.5),
hjust = 1.05,
size = 4
) +
# facet by response to mimic Pew's column layout
facet_wrap(~ fct_rev(response), nrow = 1) +
# custom fill and label colors (matching Pew palette)
scale_fill_manual(
values = c(
"Good idea for society" = "#2f6ead",
"Bad idea for society" = "#95a644",
"Not sure" = "#d6d6d6"
),
name = NULL
) +
scale_color_manual(
values = c(
"Good idea for society" = "white",
"Bad idea for society" = "white",
"Not sure" = "black"
)
) +
# axis and title formatting
labs(
title = str_wrap("Majority says brain chip implants for improved cognitive abilities would be bad idea for society; public more open to other applications of human enhancement and AI", 100),
subtitle = str_wrap("% of U.S. adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a ...", 120),
x = NULL,
y = NULL
) +
theme_minimal(base_size = 12) +
theme(
panel.grid = element_blank(),
axis.text.x = element_blank(),
axis.ticks = element_blank(),
legend.position = "none",
strip.background = element_blank(),
strip.text = element_text(face = "bold", size = 11, hjust = 0.5),
plot.title = element_text(size = 13, face = "bold", hjust = 0),
plot.subtitle = element_text(size = 10, color = "grey30", hjust = 0, face = "italic")
)
Y: Whether an individual believes that social media companies’ use of algorithms to detect false information leads to censorship
X: Confidence in social media companies
# recode variables
pew_updated <- pew %>%
mutate(
conf_company = case_when(
SMALG7_W99 == "No confidence at all" ~ 1,
SMALG7_W99 == "Not too much confidence" ~ 2,
SMALG7_W99 == "A fair amount of confidence" ~ 3,
SMALG7_W99 == "A great deal of confidence" ~ 4
),
censored_view = case_when(
SMALG4_b_W99 %in% c("Probably NOT happening", "Definitely NOT happening") ~ 0,
SMALG4_b_W99 %in% c("Probably happening", "Definitely happening") ~ 1
)
)
# update survey design with added variables
pew_design <- update(pew_design,
conf_company = pew_updated$conf_company,
censored_view = pew_updated$censored_view)
# define a logistic regression model
mod1 <- svyglm(censored_view ~ conf_company,
family = quasibinomial(link = "logit"),
design = pew_design)
summary(mod1)
Call:
svyglm(formula = censored_view ~ conf_company, design = pew_design,
family = quasibinomial(link = "logit"))
Survey design:
update(pew_design, conf_company = pew_updated$conf_company, censored_view = pew_updated$censored_view)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.39690 0.14292 9.774 < 0.0000000000000002 ***
conf_company -0.23701 0.06647 -3.566 0.000366 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for quasibinomial family taken to be 0.9888884)
Number of Fisher Scoring iterations: 4
# compute predicted probabilities
new_data <- tibble(conf_company = seq(1, 4, length.out = 100))
preds <- predict(mod1, type = "response", newdata = new_data, se.fit = TRUE)
# combine predictions with confidence intervals
new_data <- new_data %>%
mutate(
probs = as.numeric(preds),
se = sqrt(attr(preds, "var")),
lower = pmax(0, probs - 1.96 * se),
upper = pmin(1, probs + 1.96 * se)
)
ggplot() +
# jittered raw data points
geom_jitter(
data = pew_updated,
aes(x = conf_company, y = censored_view),
width = 0.3, height = 0.1, size = 0.1, alpha = 0.5
) +
# predicted probability line
geom_line(
data = new_data,
aes(x = conf_company, y = probs),
color = "#2f6ead", size = 1
) +
# confidence interval bands
geom_ribbon(
data = new_data,
aes(x = conf_company, ymin = lower, ymax = upper),
fill = "#2f6ead", alpha = 0.2
) +
# y-axis formatting
scale_y_continuous(
breaks = c(0, 0.5, 1),
limits = c(0, 1),
labels = scales::percent_format(accuracy = 1)
) +
labs(
x = "Confidence in Social Media Companies",
y = "Probability of Believing Platforms Censor Views"
) +
theme_bw(base_size = 12)
# fit Bayesian logistic regression using rstanarm
mod2 <- stan_glm(
censored_view ~ conf_company,
family = binomial(link = "logit"),
data = pew_updated
)
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 0.000384 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.84 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 1.184 seconds (Warm-up)
Chain 1: 1.479 seconds (Sampling)
Chain 1: 2.663 seconds (Total)
Chain 1:
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
Chain 2:
Chain 2: Gradient evaluation took 0.000344 seconds
Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 3.44 seconds.
Chain 2: Adjust your expectations accordingly!
Chain 2:
Chain 2:
Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2:
Chain 2: Elapsed Time: 1.379 seconds (Warm-up)
Chain 2: 1.495 seconds (Sampling)
Chain 2: 2.874 seconds (Total)
Chain 2:
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
Chain 3:
Chain 3: Gradient evaluation took 0.000187 seconds
Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 1.87 seconds.
Chain 3: Adjust your expectations accordingly!
Chain 3:
Chain 3:
Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3:
Chain 3: Elapsed Time: 1.177 seconds (Warm-up)
Chain 3: 1.442 seconds (Sampling)
Chain 3: 2.619 seconds (Total)
Chain 3:
SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
Chain 4:
Chain 4: Gradient evaluation took 0.000179 seconds
Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 1.79 seconds.
Chain 4: Adjust your expectations accordingly!
Chain 4:
Chain 4:
Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4:
Chain 4: Elapsed Time: 1.155 seconds (Warm-up)
Chain 4: 1.442 seconds (Sampling)
Chain 4: 2.597 seconds (Total)
Chain 4:
# create a new dataset for prediction across the full range of conf_company
new_data <- tibble(conf_company = seq(1, 4, length.out = 100))
# get posterior predicted probabilities
preds2 <- posterior_epred(mod2, newdata = new_data)
# compute 95% credible intervals for each predicted value
new_data <- new_data %>%
mutate(
lower = apply(preds2, 2, quantile, probs = 0.025),
upper = apply(preds2, 2, quantile, probs = 0.975),
probs = colMeans(preds2) # posterior mean prediction
)
ggplot() +
geom_jitter(
data = pew_updated,
aes(x = conf_company, y = censored_view),
width = 0.3, height = 0.1, size = 0.1, alpha = 0.5
) +
geom_line(
data = new_data,
aes(x = conf_company, y = probs),
color = "#2f6ead", size = 1
) +
geom_ribbon(
data = new_data,
aes(x = conf_company, ymin = lower, ymax = upper),
fill = "#2f6ead", alpha = 0.2
) +
scale_y_continuous(
breaks = c(0, 0.5, 1),
labels = scales::percent_format(accuracy = 1),
limits = c(0, 1)
) +
labs(
x = "Confidence in Social Media Companies",
y = "Probability of Believing Platforms Censor Views"
) +
theme_bw(base_size = 12)
Survey and Polling Methodology