Visualize descriptive patterns in the data
Bar plot and its extensions
Dot plot
Visualize fitted models, inferences, and final results
Frequentist estimation (survey package)
Bayesian estimation (rstanarm package)
Load the tidyverse package
dplyr: data wrangling
ggplot2: data visualization
stringr: string manipulation
Ingredients of a ggplot2 plot
data
aesthetics
geom objects
We will use the American Trends Panel Wave 99 (conducted Nov. 1-7, 2021) from Pew Research Center, which focuses on public attitudes on artificial intelligence (AI) and human enhancement.
Load packages
Load data
When using foreign package, categorical variables are imported as factors
When using haven package, categorical variables are imported as labelled class variables (which are sometimes difficult to wrangle)
# estimates of proportions for facial recognition technology
q1 <- svymean(~FACEREC2_W99,
design = pew_design,
na.rm = T)
q1
mean SE
FACEREC2_W99Good idea for society 0.4564492 0.0116
FACEREC2_W99Bad idea for society 0.2724515 0.0105
FACEREC2_W99Not sure 0.2679035 0.0100
FACEREC2_W99Refused 0.0031958 0.0013
# construct data for plotting
ai_society <- tibble(prop = round(c(q1[-4], q2[-4], q3[-4]), 2)*100,
type = rep(c("Facial recognition", "Algorithms", "Driverless vehicles"), each = 3),
response = rep(c("Good idea for society", "Bad idea for society", "Not sure"), times = 3))
# reorder variable levels
ai_society_reordered <- ai_society %>%
mutate(type = factor(type, levels = c("Driverless vehicles", "Algorithms", "Facial recognition")),
response = factor(response, levels = c("Not sure",
"Bad idea for society",
"Good idea for society")))
q1_data <- tibble(prop = round(q1[-4], 2) * 100,
cat = c("Good idea for society", "Bad idea for society", "Not sure"))
ggplot(q1_data, aes(x = cat, y = prop)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = paste0(prop, "%")), hjust = 0.5, vjust = -0.3) +
scale_y_continuous(labels = scales::percent_format(scale = 1)) +
theme_bw() +
labs(x = "", y = "")
ai_society_reordered %>% # alternative: |>
ggplot(aes(x = type, y = prop, fill = response)) +
geom_bar(position = "stack",
stat = "identity", # use actual values in y
width = 0.4) +
geom_text(aes(label = prop, color = response),
position = position_stack(),
hjust = 1.1) + # hjust = 0.5 centers at the x position, hjust < 0.5 moves right, hjust > 0.5 moves left
coord_flip() + # flip the x and y coordinates (vertical bars become horizontal bars)
scale_fill_manual(values = c("Good idea for society" = "#2f6ead",
"Bad idea for society" = "#95a644",
"Not sure" = "#d6d6d6"),
name = "") + # https://imagecolorpicker.com/en to pick colors
scale_color_manual(values = c("Good idea for society" = "white",
"Bad idea for society"= "white",
"Not sure"= "black")) +
labs(y = "",
x = "",
title = str_wrap("The public is more open to applications of human enhancement and AI", 80),
subtitle = str_wrap("% of US adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a...", 100)) +
guides(fill = guide_legend(reverse = T), color = "none") + # reverse the legend order to match the color order in each bar
theme(legend.position = "top",
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
plot.title = element_text(size = 13, face = "bold", hjust = 0),
plot.subtitle = element_text(size = 10, color = "grey", hjust = 0, face = "italic"))
ggplot(ai_society_reordered, aes(x = type, y = prop, fill = response)) +
geom_bar(position = "stack",
stat = "identity",
width = 0.6) +
geom_text(aes(label = prop, color = response),
position = position_stack(),
hjust = 1.1) +
geom_vline(xintercept = c(1.5, 2.5), linetype = "dashed") + # add dashed lines separating questions
coord_flip() +
scale_fill_manual(values = c("Good idea for society" = "#2f6ead",
"Bad idea for society" = "#95a644",
"Not sure" = "#d6d6d6"),
name = "") +
scale_color_manual(values = c("Good idea for society" = "white",
"Bad idea for society"= "white",
"Not sure"= "black")) +
facet_wrap(~fct_rev(response)) + # split into three multiples based on response option
labs(y = "",
x = "",
title = str_wrap("The public is more open to applications of human enhancement and AI", 80),
subtitle = str_wrap("% of US adults who say the widespread use of each of the following artificial intelligence and human enhancement applications has been/would be a...", 100)) +
theme(legend.position = "none",
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
strip.background = element_blank(),
strip.text = element_text(face = "bold", hjust = 0),
plot.title = element_text(size = 13, face = "bold", hjust = 0),
plot.subtitle = element_text(size = 10, color = "grey", hjust = 0, face = "italic"))
mean
FACEREC7_W99Government will go too far regulating its use 0.471111
FACEREC7_W99Government will not go far enough regulating its use 0.509852
FACEREC7_W99Refused 0.019037
SE
FACEREC7_W99Government will go too far regulating its use 0.0116
FACEREC7_W99Government will not go far enough regulating its use 0.0116
FACEREC7_W99Refused 0.0030
q4.rep <- svymean(~FACEREC7_W99,
subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
na.rm = T)
q4.rep
mean
FACEREC7_W99Government will go too far regulating its use 0.586876
FACEREC7_W99Government will not go far enough regulating its use 0.396366
FACEREC7_W99Refused 0.016757
SE
FACEREC7_W99Government will go too far regulating its use 0.0167
FACEREC7_W99Government will not go far enough regulating its use 0.0167
FACEREC7_W99Refused 0.0034
q4.dem <- svymean(~FACEREC7_W99,
subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
na.rm = T)
q4.dem
mean
FACEREC7_W99Government will go too far regulating its use 0.364099
FACEREC7_W99Government will not go far enough regulating its use 0.619109
FACEREC7_W99Refused 0.016792
SE
FACEREC7_W99Government will go too far regulating its use 0.0159
FACEREC7_W99Government will not go far enough regulating its use 0.0160
FACEREC7_W99Refused 0.0046
fr_party <- tibble(type = c("U.S. adults", "Rep/lean Rep", "Dem/lean Dem"),
go_too_far = round(c(q4[1], q4.rep[1], q4.dem[1]), 2)*100,
not_far_enough = round(c(q4[2], q4.rep[2], q4.dem[2]), 2)*100)
# have values for one group as negatives
fr_party_long <- fr_party %>%
pivot_longer(cols = -type, names_to = "response", values_to = "prop") %>%
mutate(prop = ifelse(response == "not_far_enough", -1 * prop, prop)) %>%
mutate(response = factor(response, levels = c("not_far_enough", "go_too_far"))) %>%
mutate(type = factor(type, levels = c("Dem/lean Dem", "Rep/lean Rep", "U.S. adults")))
ggplot(fr_party_long, aes(x = type, y = prop, fill = response)) +
geom_col(position = "identity",
width = 0.6) +
theme(aspect.ratio = 0.5) + # adjust the ratio of height to width to make bars thinner without increasing their distance
coord_flip() +
labs(x = "",
y = "",
title = str_wrap("Similar shares say government will go too far or not far enough regulating police's facial recognition use, but this varies by party", 85),
subtitle = str_wrap("% of U.S. adults who say that if the use of facial recognition technology by police becomes widespread, their greater concern is that government will __ regulating its use", 105),
caption = "Note: Respondents who did not give an answer are not shown.\nSource: Survey conducted Nov.1-7, 2021.") +
geom_text(data = fr_party_long[fr_party_long$prop<0, ],
aes(label = prop*-1), position = position_stack(vjust = -0.1)) +
geom_text(data = fr_party_long[fr_party_long$prop>0, ],
aes(label = prop), position = position_stack(vjust = 1.1)) +
scale_fill_manual(values = c("not_far_enough" = "#95a644", "go_too_far" = "#2f6ead"),
labels = c("Not go far enough", "Go too far"),
name = "") +
guides(fill = guide_legend(keywidth = unit(0, "mm"), # remove legend key squares
keyheight = unit(0, "mm"),
direction = "horizontal")) +
theme(legend.position = c(0.3, 1), # x, y are between 0 and 1
legend.spacing.x = unit(15, "mm"),
legend.text = element_text(face = "bold", size = 12),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
plot.title.position = "plot", # move title to the leftmost position
plot.caption.position = "plot", # move subtitle to the leftmost position
plot.subtitle = element_text(size = 10, color = "grey60", hjust = 0, face = "italic", margin = margin(b=20)),
plot.caption = element_text(size = 10, color = "grey60", hjust = 0), # add graph notes
plot.margin = margin(b=10,t=10,l=10,r=10))
# news and information
q5.dem <- svymean(~I(SMALG4_a_W99 == "Definitely happening" | SMALG4_a_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
na.rm = T)
q5.rep <- svymean(~I(SMALG4_a_W99 == "Definitely happening" | SMALG4_a_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
na.rm = T)
# political viewpoints
q6.dem <- svymean(~I(SMALG4_b_W99 == "Definitely happening" | SMALG4_b_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
na.rm = T)
q6.rep <- svymean(~I(SMALG4_b_W99 == "Definitely happening" | SMALG4_b_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
na.rm = T)
# trustworthy information
q7.dem <- svymean(~I(SMALG4_c_W99 == "Definitely happening" | SMALG4_c_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
na.rm = T)
q7.rep <- svymean(~I(SMALG4_c_W99 == "Definitely happening" | SMALG4_c_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
na.rm = T)
# meaningful conversations
q8.dem <- svymean(~I(SMALG4_d_W99 == "Definitely happening" | SMALG4_d_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Dem/Lean Dem"),
na.rm = T)
q8.rep <- svymean(~I(SMALG4_d_W99 == "Definitely happening" | SMALG4_d_W99 == "Probably happening"),
subset(pew_design, F_PARTYSUM_FINAL == "Rep/Lean Rep"),
na.rm = T)
companies <- tibble(issue = c("Political viewpoints are\n being censored",
"News and information\n are being wrongly removed",
"It is getting easier to\n find trustworthy\n information",
"It is allowing people to\n have more meaningful\n conversations"),
dem = round(c(q6.dem[2], q5.dem[2], q7.dem[2], q8.dem[2]), 2) * 100,
rep = round(c(q6.rep[2], q5.rep[2], q7.rep[2], q8.rep[2]), 2) * 100,
diff = ifelse(rep - dem > 0,
paste0("+", as.character(rep-dem)),
as.character(rep-dem)))
subtitle <- "% of U.S. adults who say that due to widespread use of computer programs by social media companies to find <br> false information, each of the following is <b>definitely</b> or <b>probably happening</b> on the companies' sites"
ggplot(companies, aes(y = issue)) +
geom_segment(aes(x = dem, xend = rep, yend = issue), linetype = "solid", linewidth = 2, color = 'grey90') +
geom_point(aes(x = dem), color = 'royalblue4', size = 3) + # Blue dots for Dem/lean Dem
geom_point(aes(x = rep), color = 'tomato4', size = 3) + # Red dots for Rep/lean Rep
geom_text(aes(x = dem, label = dem), color = 'royalblue4', vjust =2) +
geom_text(aes(x = rep, label = rep), color = 'tomato4', vjust =2) +
geom_rect(aes(xmin=93, xmax = 103, ymin = 0.5, ymax = 5),
fill = "grey90",
alpha = 0.2) +
geom_text(aes(x = 96, label = diff), hjust = 0) +
labs(x = "",
y = "",
title = str_wrap("Majorities in both parties say social media companies using algorithms to find false information is leading to censorship, but Republicans far more likely to say so", 95),
subtitle = subtitle) +
annotate(geom = "text", label = str_wrap("Dem/lean Dem", 1), color = 'royalblue4', x = 58, y = 4.7) +
annotate(geom = "text", label = str_wrap("Rep/lean Rep", 1), color = 'tomato4', x = 86, y = 4.7) +
annotate(geom = "text", label = str_wrap("DIFF Rep-Dem", 2), x = 98, y = 4.7) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
plot.subtitle = element_markdown(size = 10, margin = margin(b=20), color = "grey60", face = "italic"),
plot.title.position = "plot")
censored_views <- tibble(party = c("dem", "rep"),
est = c(q6.dem[2], q6.rep[2])*100,
lower = c(confint(q6.dem)[2,1], confint(q6.rep)[2,1])*100,
upper = c(confint(q6.dem)[2,2], confint(q6.rep)[2,2])*100
)
ggplot(censored_views, aes(x = party, y = est, ymin = lower, ymax = upper)) +
geom_pointrange() + # we can also use geom_errorbar()
scale_y_continuous(labels = scales::percent_format(scale = 1)) +
scale_x_discrete(labels = c("Dem/lean Dem", "Rep/lean Rep")) +
theme_bw() +
labs(x = "", y = "", title = "% of U.S. adults who believe the use of algorithms to detect false information\n leads to censorship")
Y: Whether an individual believes that social media companies’ use of algorithms to detect false information leads to censorship
X: Confidence in social media companies
# recode variables
pew_updated <- pew %>%
mutate(conf_company = case_when(SMALG7_W99 == "No confidence at all" ~ 1,
SMALG7_W99 == "Not too much confidence" ~ 2,
SMALG7_W99 == "A fair amount of confidence" ~ 3,
SMALG7_W99 == "A great deal of confidence" ~ 4),
censored_view = case_when(SMALG4_b_W99 == "Probably NOT happening" | SMALG4_b_W99 == "Definitely NOT happening" ~ 0,
SMALG4_b_W99 == "Probably happening" | SMALG4_b_W99 == "Definitely happening" ~ 1))
# update survey design with added variables
pew_design <- update(pew_design, conf_company = pew_updated$conf_company)
pew_design <- update(pew_design, censored_view = pew_updated$censored_view)
# define a logistic regression model
mod1 <- svyglm(censored_view ~ conf_company, family = quasibinomial(link = "logit"), design = pew_design)
summary(mod1)
Call:
svyglm(formula = censored_view ~ conf_company, design = pew_design,
family = quasibinomial(link = "logit"))
Survey design:
update(pew_design, censored_view = pew_updated$censored_view)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.39690 0.14292 9.774 < 0.0000000000000002 ***
conf_company -0.23701 0.06647 -3.566 0.000366 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for quasibinomial family taken to be 0.9888884)
Number of Fisher Scoring iterations: 4
# compute predicted probabilities
new_data <- tibble(conf_company = seq(1, 4, length.out = 100))
preds <- predict(mod1, type = "response", newdata = new_data, se.fit = T) %>% as.tibble()
new_data <- new_data %>%
mutate(probs = preds$response,
lower = preds$response - 1.96 * preds$SE,
upper = preds$response + 1.96 * preds$SE)
ggplot() +
# plot raw data
geom_jitter(data = pew_updated,
aes(x = conf_company, y = censored_view),
size = 1/10, width = 0.3 , height = 0.1, alpha=0.5) +
# plot fitted model
geom_line(data=new_data, aes(x = conf_company, y = probs), color = "blue", size = 1) +
# plot confidence interval bands
geom_ribbon(data=new_data, aes(x = conf_company, ymin = lower, ymax = upper), alpha = 0.2) +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
labs(x = "Confidence in social media companies", y = "Probability of perceived censorship") +
theme_bw()
mod2 <- stan_glm(censored_view ~ conf_company, family = binomial(link = "logit"), data = pew_updated)
new_data <- tibble(conf_company = seq(1, 4, length.out=100))
preds2 <- posterior_epred(mod2, newdata = new_data)
# Compute credible intervals
ci_lower <- apply(preds2, 2, quantile, probs = 0.025)
ci_upper <- apply(preds2, 2, quantile, probs = 0.975)
new_data <- new_data %>%
mutate(lower = ci_lower, upper = ci_upper, probs = (ci_lower+ci_upper)/2)
ggplot() +
geom_jitter(data=pew_updated, aes(x = conf_company, y = censored_view), size=1/10, width=0.3 , height = 0.1,alpha=0.5) +
geom_line(data=new_data, aes(x = conf_company, y = probs), color = "blue", size = 1) +
geom_ribbon(data=new_data, aes(x = conf_company, ymin = lower, ymax = upper), alpha = 0.2) +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
labs(x = "Confidence in social media companies", y = "Probability of perceived censorship") +
theme_bw()
Survey and Polling Methodology