Setting custom labels on stacked ggplot2 plot - r

This question is a continuation of my previous question here.
I have a heatmap with a dataset available. The dataset is pasted below:
library(ggplot2)
library(colorspace)
library(directlabels)
smalltest <- structure(list(x = c(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8,
-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7,
-6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7, -6,
-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7, -6, -5,
-4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8),
y = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5),
z = c(0.353727812855041, 0.512450876310741,
0.668920303216554, 0.770942367746301, 0.829915628317595, 0.873001682466956,
0.900219155289838, 0.918353789896507, 0.936488424503176, 0.954641684205298,
0.961439844045867, 0.972770110446816, 0.975042372092157, 0.981846740297877,
0.986385055223408, 0.986385055223408, 0.986385055223408, 0.33104865495769,
0.464820298870698, 0.62128351741136, 0.752801524774481, 0.804964208774903,
0.850322524569605, 0.879812259037828, 0.913821683336127, 0.934222371222986,
0.950109577644919, 0.959173790765678, 0.970504057166626, 0.975042372092157,
0.981846740297877, 0.986385055223408, 0.986385055223408, 0.986385055223408,
0.31064175870568, 0.428544821292209, 0.589558771488704, 0.725596468681902,
0.786835782533385, 0.838986049803505, 0.872995474101805, 0.897946893644497,
0.920613634811545, 0.943299001074047, 0.956907737485488, 0.970504057166626,
0.970504057166626, 0.981840531932726, 0.986385055223408, 0.986385055223408,
0.986385055223408, 0.29023486245367, 0.419493024901753, 0.569145666871543,
0.702929727514853, 0.775480682671832, 0.827655783402557, 0.866197314261236,
0.891148733803927, 0.916075319886014, 0.931943901212494, 0.952369422559957,
0.970504057166626, 0.970504057166626, 0.981840531932726, 0.981840531932726,
0.986385055223408, 0.986385055223408, 0.272100227847001, 0.396807658639251,
0.557778150279687, 0.691580836018451, 0.766410261185922, 0.807248887150547,
0.857126892775325, 0.888876472158586, 0.911543213325635, 0.929684056297455,
0.941020531063555, 0.959155165670224, 0.968231795521285, 0.977302217007196,
0.981840531932726, 0.984112793578067, 0.984112793578067, 0.265302068006432,
0.396789033543797, 0.557784358644838, 0.680244361252351, 0.761871946260391,
0.800444518944826, 0.841264519813997, 0.882078312318017, 0.909277160045445,
0.931950109577645, 0.941014322698404, 0.954623059109845, 0.961421218950414,
0.972763902081665, 0.977302217007196, 0.984112793578067, 0.984112793578067)),
row.names = c(NA, -102L), class = c("tbl_df", "tbl", "data.frame"))
I can generate a heatmap and contour lines based on the dataset presented above.
ggplot(smalltest, aes(x = x, y = y)) +
geom_tile(aes(fill = z)) +
scale_fill_continuous_divergingx(palette = 'RdBu', rev = FALSE, mid = 0.9, l3 = 0, p3 = 0.95, p4 = 0.85) +
scale_x_continuous(expand = c(0, 0), breaks = -8:8) +
scale_y_continuous(expand = c(0, 0), breaks = 0:5) +
geom_contour(aes(z = z), breaks = c(0.8, 0.9, 0.95), color = 'black', size = 1) +
geom_dl(aes(label = c(rep(NA, 99), 'Low', 'Middle', 'High')), method = 'last.points')
I have the following questions:
How do I get geom_dl() to use only the ends (or the sides) of the contour lines for those labels? I added the rep(NA, 99) because otherwise I get an error stating that the length of label must be either 1 or the length of the dataset (102 here). But how can I simply pass in a vector of custom strings? (One idea is sketched right after these questions.)
How do I extend the contour lines to the ends of the tile?
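One possible approach (a hedged sketch, not tested against this data) is to let geom_dl() compute the contour pieces itself via stat = 'contour', so that the labels attach to the ends of the contour lines rather than to the raw tile rows. Note that after_stat() needs ggplot2 >= 3.3; older versions used ..level.. instead. Recoding the computed level into custom strings such as 'Low'/'Middle'/'High' would then be a separate, equally untested step.
ggplot(smalltest, aes(x = x, y = y)) +
  geom_tile(aes(fill = z)) +
  geom_contour(aes(z = z), breaks = c(0.8, 0.9, 0.95), color = 'black', size = 1) +
  geom_dl(aes(z = z, label = after_stat(level)),       # label each computed contour level
          stat = 'contour', breaks = c(0.8, 0.9, 0.95),
          method = 'last.points')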
Thanks!
Created on 2019-11-07 by the reprex package (v0.3.0)

Related

geom_bar(), Y-axis goes way above data value

I am trying to visualize a data frame from a survey. I'm currently trying to plot a bar plot with geom_bar() that puts "Life Satisfaction" on the y-axis and "Family Values" on the x-axis. Note that the survey answers for Life Satisfaction range from 1 (very unsatisfied) to 10 (very satisfied).
But for some reason, when I plot this bar plot the y-axis goes way above 10, and I don't understand why.
This is my code:
df1 %>%
filter(df1$B_COUNTRY_ALPHA == "PAK") %>%
drop_na(Q49) %>%
ggplot(aes(x = Q1, y = Q49, fill = B_COUNTRY_ALPHA)) +
geom_bar(stat = "identity") +
labs(x = "Family Value",
y = "Life Satisfaction")
This is the graph that I get when I run it:
This is the first 20 rows of data that I want to work with:
On a side note: I was thinking of plotting the mean of the Life Satisfaction data, which might make the plot make more sense, but I am not sure how to do that.
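A minimal sketch of that "mean per group" idea, assuming the tidyverse is installed and df1 has the columns shown in the dput further down:
library(dplyr)
library(tidyr)
library(ggplot2)

df1 %>%
  filter(B_COUNTRY_ALPHA == "PAK") %>%
  drop_na(Q49) %>%
  group_by(Q1) %>%                               # one bar per Family Value answer
  summarise(mean_satisfaction = mean(Q49)) %>%   # average Life Satisfaction per group
  ggplot(aes(x = factor(Q1), y = mean_satisfaction)) +
  geom_col() +
  labs(x = "Family Value", y = "Mean Life Satisfaction")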
@GregorThomas I followed your instructions and I got this:
sample_data <- structure(list(B_COUNTRY_ALPHA = c("PAK", "PAK", "PAK", "PAK",
"PAK", "PAK", "PAK", "PAK", "PAK", "PAK", "PAK", "PAK", "PAK",
"PAK", "PAK", "PAK", "PAK", "PAK", "PAK", "PAK"), Q49 = c(7,
10, 10, 5, 1, 6, 6, 10, 10, 10, 4, 4, 8, 10, 10, 10, 10, 9, 10,
8), Q1 = c(1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1), Q2 = c(1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1,
4, 1, 2, 2, 2), Q3 = c(2, 2, 1, 1, 3, 1, 2, 2, 2, NA, 2, 4, 1,
1, 2, 2, 4, 2, 4, 2), Q4 = c(3, 4, 2, 4, 2, 3, 4, 2, 1, 4, 4,
4, 4, 1, 3, 4, 3, 4, 4, 2), Q5 = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 1, 2, 1, 1, 1, 4, 1, 1, 4), Q6 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4), Q57 = c(2, 2, 2, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1), Q106 = c(7, 5,
10, 4, 10, 7, 1, 10, 10, 10, 1, 10, 1, 10, 10, 10, 9, 4, 10,
6), Q107 = c(7, 6, 5, 5, 10, 3, 1, 10, 10, NA, 1, 1, 1, 10, 3,
10, 10, 8, 10, 4), Q108 = c(7, 9, 1, 4, 1, 1, 10, 10, 5, 10,
10, 10, 1, 10, 10, 10, 10, 10, 1, 3), Q109 = c(6, 4, 1, 4, 1,
1, 1, 10, 10, 1, 6, 2, 10, 5, 10, 1, 10, 9, 1, 4), Q110 = c(6,
3, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 10, 1, 10, 3, 1, 3), Q112 =
c(8,
8, 10, 6, 10, 5, 10, 10, 10, 10, NA, 10, 10, 10, 10, 10, 10,
10, 10, 7), Q163 = c(6, 2, 10, 7, 9, 10, 10, 10, 10, NA, 10,
10, 6, 10, 3, NA, 8, 7, NA, 9), Q164 = c(4, 9, 10, 8, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, NA, 8, 10, 10, 10), Q222 = c(2,
1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, NA, 1, NA, 2, 3, NA, 3),
Q260 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 0, 1), Q262 = c(33, 21, 60, 18, 60, 50, 45, 29, 62,
46, 35, 40, 30, NA, 45, NA, 30, 50, 36, 34), Q273 = c(1,
6, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
Q275 = c(0, 2, 3, 3, 3, 2, 3, 2, 4, 0, 0, 0, 1, NA, 3, NA,
1, 1, 0, 1), Q281 = c(8, 0, 3, 0, 10, 3, 4, 6, 3, 8, 4, 4,
4, 0, 5, 0, 0, 0, 9, 0)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L))
With stat = "identity" (and the default position = "stack"), geom_bar() stacks all the bars that share an x value, so the y-axis shows the sum of the Q49 responses within each Family Value category rather than the 1-10 scale, which is why it climbs far above 10. Here are a couple of ideas using your sample data:
Use a dodged bar plot:
sample_data %>%
ggplot(aes(x = factor(Q1), fill = factor(Q49))) +
geom_bar(position = position_dodge(preserve = 'single')) +
labs(x = "Family Value",
y = "Count of Responses",
fill = "Life Satisfaction")
Use facets:
sample_data %>%
ggplot(aes(x = factor(Q49), fill = factor(Q49))) +
geom_bar() +
labs(x = "Life Satisfaction",
y = "Count of Responses",
fill = "Life Satisfaction") +
facet_wrap(vars(paste("Family Value", Q1)))
Use a heat map:
sample_data %>%
ggplot(aes(x = factor(Q1), y = factor(Q49))) +
geom_bin2d() +
coord_fixed() +
labs(y = "Life Satisfaction", x = "Family Value")

Bootstrapping multiple regression error: number of items to replace is not a multiple of replacement length

I want to bootstrap my dataset for multiple regression. Unfortunately I get this error message:
"number of items to replace is not a multiple of replacement length"
I suspect that the factors in my regression formula may be problematic.
What could I do to solve my problem?
My code is as follows (I am reading Andy Field's Discovering Statistics Using R):
library(boot)

BootReg <- function(data, indices, formula) {
  d <- data[indices, ]           # the rows resampled by boot()
  fit <- lm(formula, data = d)
  return(coef(fit))
}
bootResults <- boot(statistic = BootReg,
                    formula = TICS_Skala1 ~ HSPhoch + HSPhoch * extra.c + psy + sex + age.c,
                    data = mod.reg.data, R = 2000)
psy (psychiatric disease), sex, and HSPhoch (high sensory-processing sensitivity) are factors; TICS_Skala1, extra.c, and age.c are continuous variables.
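A hedged debugging sketch (not from the original post): one common trigger for this boot() error is the statistic returning coefficient vectors of different lengths across resamples, which can be checked by calling the statistic by hand a few times.
set.seed(1)
frm <- TICS_Skala1 ~ HSPhoch + HSPhoch * extra.c + psy + sex + age.c   # same model formula as above
lens <- replicate(10, {
  idx <- sample(nrow(mod.reg.data), replace = TRUE)   # one bootstrap resample
  length(BootReg(mod.reg.data, idx, frm))             # length of the returned coefficient vector
})
lens  # these should all be identical for boot() to fill its result matrix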
My sample data:
> dput(head(mod.reg.data, 20))
structure(list(neo_01 = c(3, 4, 3, 0, 4, 4, 3, 2, 3, 1, 4, 2,
3, 3, 1, 2, 3, 4, 0, 2), neo_03 = c(1, 1, 1, 3, 1, 2, 0, 0, 0,
0, 0, 0, 1, 3, 1, 1, 1, 1, 3, 1), neo_04 = c(2, 4, 3, 0, 4, 3,
4, 3, 2, 3, 3, 3, 3, 4, 2, 4, 3, 4, 3, 3), neo_08 = c(3, 0, 1,
2, 3, 3, 4, 3, 2, 1, 2, 4, 0, 3, 1, 1, 3, 1, 3, 1), neo_12 = c(3,
1, 1, 2, 2, 2, 4, 1, 1, 2, 1, 4, 1, 3, 1, 1, 3, 2, 3, 2), neo_13 = c(3,
2, 2, 4, 3, 3, 3, 2, 2, 1, 2, 3, 0, 3, 1, 0, 2, 3, 0, 2), neo_16 = c(3,
1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 3, 0, 2, 0, 0, 0, 0, 2, 1), neo_17 = c(2,
1, 3, 0, 1, 1, 1, 4, 3, 1, 2, 2, 2, 3, 1, 0, 2, 0, 2, 2), neo_18 = c(2,
3, 4, 0, 4, 3, 4, 3, 3, 1, 3, 2, 4, 2, 3, 4, 3, 4, 2, 2), neo_21 = c(3,
0, 1, 2, 1, 2, 1, 1, 1, 1, 1, 3, 0, 4, 1, 0, 0, 0, 4, 1), neo_26 = c(3,
0, 0, 0, 2, 1, 3, 0, 1, 1, 0, 2, 3, 3, 0, 0, 1, 1, 4, 1), neo_27 = c(3,
3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 2, 2), TICS_1 = c(3,
0, 3, 2, 2, 1, 3, 3, 1, 2, 0, 4, 2, 3, 2, 3, 4, 1, 3, 2), TICS_2 = c(3,
1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 4, 3, 1, 1, 1, 2, 1, 2, 1), TICS_3 = c(2,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 1, 2, 0, 1, 1, 0, 1, 0), TICS_4 = c(2,
0, 2, 0, 1, 2, 1, 3, 0, 0, 0, 4, 1, 2, 1, 2, 1, 1, 2, 2), TICS_5 = c(2,
3, 2, 1, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1), TICS_6 = c(3,
2, 2, 4, 2, 2, 1, 3, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2), TICS_7 = c(3,
3, 2, 2, 2, 2, 0, 3, 1, 2, 1, 4, 2, 0, 2, 1, 4, 1, 0, 1), TICS_8 =c(NA,
NA, NA, NA, NA, NA, NA, NA, 1, 1, 0, 4, 3, 1, 1, 3, 3, 2, 1,
2), TICS_9 = c(NA, NA, NA, NA, NA, NA, NA, NA, 0, 3, 2, 2, 1,
3, 0, 1, 3, 1, 1, 2), TICS_10 = c(2, 2, 0, 0, 2, 3, 0, 2, 1,
1, 2, 2, 1, 0, 0, 1, 1, 2, 2, 1), TICS_11 = c(1, 2, 1, 0, 1,
1, 0, 0, 0, 0, 2, 4, 1, 0, 0, 0, 0, 1, 1, 0), TICS_12 = c(2,
2, 1, 0, 1, 1, 1, 3, 1, 1, 1, 4, 2, 2, 2, 3, 3, 1, 2, 3), TICS_13=
c(1, 1, 3, 0, 2, 3, 2, 1, 1, 2, 1, 2, 2, 3, 2, 2, 1, 2, 2, 2),
TICS_14= c(4, 1, 1, 0, 1, 1, 3, 4, 0, 2, 0, 4, 2, 3, 0, 1, 3, 1, 1,
1), TICS_15= c(3, 1, 1, 3, 0, 2, 0, 2, 0, 2, 1, 2, 0, 1, 1, 1, 0, 0,
0, 1), ICS_16= c(4, 2, 1, 3, 3, 2, 1, 2, 1, 1, 1, 3, 1, 3, 1, 2, 3,
1, 2, 1), TICS_17= c(3, 0, 2, 2, 1, 2, 2, 3, 0, 1, 1, 2, 1, 2, 2, 3,
1, 1, 1, 2), TICS_18= c(3, 0, 1, 2, 0, 1, 1, 0, 0, 1, 0, 4, 2, 2, 0,
0, 1, 0, 2, 0), TICS_19= c(4, 2, 2, 2, 2, 2, 0, 2, 1, 2, 1, 4, 3, 2,
1, 1, 1, 0, 1, 2), TICS_20= c(2, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 4, 1,
1, 0, 0, 1, 0, 2, 0), TICS_21= c(2, 1, 1, 0, 2, 3, 0, 1, 0, 1, 3, 2,
2, 1, 2, 1, 1, 1, 3, 0), TICS_22= c(3, 0, 1, 2, 2, 3, 1, 4, 0, 1, 1,
2, 3, 1, 1, 2, 3, 2, 0, 3), TICS_24= c(2, 0, 0, 1, 0, 0, 2, 0, 1, 1,
0, 2, 0, 0, 0, 1, 1, 0, 0, 1), TICS_25= c(4, 0, 1, 2, 2, 2, 4, 2, 1,
1, 0, 3, 0, 2, 0, 1, 2, 1, 2, 1), TICS_26= c(3, 0, 2, 2, 0, 1, 1, 0,
0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1), TICS_27= c(3,
1, 4, 2, 3, 3, 4, 4, 0, 1, 0, 3, 2, 3, 2, 3, 2, 2, 4, 3), TICS_28=
c(3, 2, 2, 1, 1, 2, 1, 2, 1, 1, 0, 4, 1, 2, 1, 0, 1, 0, 0, 2),
TICS_29= c(2, 0, 1, 0, 2, 2, 1, 0, 1, 0, 0, 4, 1, 1, 0, 1, 0, 0, 1,
1), TICS_30= c(2, 1, 3, 1, 2, 2, 1, 0, 1, 1, 1, 3, 2, 0, 1, 0, 1, 2,
2, 2), TICS_31= c(2, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 3, 2, 1, 0, 0, 1,
0, 2, 1), TICS_32= c(4, 1, 1, 0, 1, 2, 1, 4, 0, 3, 0, 3, 3, 2, 1, 2,
2, 2, 3, 3), TICS_33= c(2,
1, 0, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 1, 1), TICS_34=
c(1, 3, 0, 0, 2, 1, 1, 1, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0),
TICS_35= c(1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 2, 0, 1, 0, 1, 1, 0, 4,
1), TICS_36= c(4, 1, 2, 3, 3, 2, 4, 1, 0, 1, 2, 3, 1, 3, 0, 1, 1, 0,
2, 1), TICS_37= c(1, 1, 2, 0, 2, 3, 3, 0, 1, 2, 1, 2, 1, 0, 2, 2, 1,
1, 2, 1), TICS_38= c(3, 0, 3, 1, 2, 2, 2, 3, 0, 2, 0, 4, 0, 2, 1, 2,
2, 1, 1, 2), TICS_39= c(1, 1, 2, 2, 3, 1, 1, 2, 1, 1, 1, 4, 1, 1, 1,
1, 3, 0, 0, 3), TICS_40= c(2, 0, 2, 0, 3, 2, 1, 2, 0, 0, 0, 3, 2, 2,
0, 1, 2, 0, 0, 1), TICS_41= c(2, 2, 0, 0, 2, 3, 1, 1, 0, 1, 3, 1, 2,
0, 1, 0, 0, 1, 2, 0), TICS_42= c(1, 2, 0, 0, 2, 1, 0, 0, 0, 1, 1, 2,
1, 1, 1, 0, 0, 0, 0, 0), TICS_43= c(4,
1, 1, 2, 2, 3, 3, 3, 0, 2, 1, 4, 3, 2, 1, 1, 3, 1, 2, 3), TICS_44=
c(3, 0, 2, 1, 2, 2, 3, 3, 0, 1, 0, 4, 1, 3, 0, 2, 2, 1, 3, 1),
TICS_45= c(2,
0, 1, 2, 0, 1, 0, 2, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 1), TICS_46=
c(2, 1, 0, 1, 2, 2, 1, 0, 0, 3, 1, 4, 3, 1, 1, 0, 1, 1, 2, 1),
TICS_47= c(3,
1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 3, 1, 2, 1, 2, 1, 1, 4, 1), TICS_48=
c(1,
2, 3, 1, 2, 3, 1, 1, 0, 2, 2, 4, 2, 3, 2, 2, 1, 0, 2, 0), TICS_49=
c(1,
3, 2, 2, 1, 2, 2, 1, 0, 1, 1, 4, 3, 0, 1, 2, 4, 1, 0, 3), TICS_50=
c(3,
0, 3, 1, 1, 2, 4, 3, 0, 2, 0, 4, 2, 3, 2, 2, 2, 2, 2, 3), TICS_51=
c(1,
2, 0, 0, 2, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 0, 0, 0, 0), TICS_52=
c(2,
1, 3, 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1), TICS_53=
c(2,
2, 2, 0, 2, 3, 1, 1, 0, 2, 2, 3, 2, 2, 2, 1, 1, 1, 2, 1), TICS_54=
c(3,
0, 3, 2, 2, 2, 3, 3, 1, 2, 0, 4, 0, 2, 0, 2, 2, 0, 2, 1), TICS_55=
c(2,
0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 4, 0, 1, 0, 1, 1, 0, 2, 0), TICS_56=
c(4,
3, 1, 0, 2, 0, 0, 0, 1, 0, 1, 2, 1, 1, 1, 0, 0, 0, 2, 0), TICS_57=
c(2,
1, 1, 0, 2, 1, 0, 0, 1, 1, 1, 4, 3, 0, 0, 1, 1, 0, 0, 2), HSPS_1 =
c(3,
4, 3, 3, 4, 2, 4, 2, 4, 2, 3, 4, 2, 2, 4, 2, 3, 3, 5, 2), HSPS_2 =
c(4,
4, 3, 5, 5, 3, 2, 4, 5, 5, 3, 4, 3, 4, 4, 2, 4, 3, 4, 3), HSPS_3 =
c(4,
4, 4, 3, 3, 4, 3, 3, 3, 3, 3, 5, 3, 4, 5, 3, 3, 3, 4, 2), HSPS_4 =
c(4,
2, 1, 4, 2, 3, 5, 3, 5, 2, 3, 3, 3, 4, 3, 3, 4, 2, 5, 2), HSPS_5 =
c(2,
2, 2, 4, 3, 3, 3, 1, 4, 3, 3, 4, 3, 2, 4, 3, 4, 3, 5, 1), HSPS_6 =
c(4,
3, 1, 3, 4, 3, 3, 3, 3, 2, 1, 1, 1, 3, 5, 3, 3, 1, 1, 2), HSPS_7 =
c(4,
3, 1, 3, 4, 2, 3, 1, 4, 3, 2, 4, 1, 1, 5, 3, 3, 1, 5, 1), HSPS_8 =
c(4,
3, 5, 5, 4, 5, 5, 3, 4, 4, 3, 3, 2, 4, 4, 3, 4, 3, 3, 3), HSPS_9 =
c(3,
2, 2, 5, 3, 3, 4, 1, 5, 2, 2, 4, 1, 2, 4, 4, 3, 1, 5, 2), HSPS_10=
c(4,
4, 5, 4, 4, 4, 3, 1, 4, 3, 3, 4, 2, 1, 5, 3, 4, 4, 3, 2), HSPS_11=
c(3,
2, 2, 3, 2, 2, 3, 1, 3, 2, 4, 5, 1, 3, 3, 3, 3, 2, 3, 2), HSPS_12=
c(4,
4, 5, 5, 4, 5, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 4, 4, 5, 4), HSPS_13=
c(3,
2, 3, 2, 2, 2, 5, 2, 3, 2, 3, 4, 3, 3, 3, 3, 4, 2, 5, 2), HSPS_14=
c(3,
2, 2, 3, 3, 3, 5, 3, 3, 2, 3, 3, 2, 3, 2, 3, 3, 2, 4, 2), HSPS_15=
c(4,
4, 2, 3, 4, 3, 3, 3, 4, 2, 3, 3, 5, 2, 4, 2, 3, 3, 3, 2), HSPS_16=
c(2,
2, 1, 5, 2, 3, 2, 2, 3, 3, 3, 5, 2, 3, 3, 3, 2, 2, 5, 2), HSPS_17=
c(4,
3, 4, 5, 3, 4, 4, 2, 4, 3, 5, 4, 4, 4, 5, 4, 5, 2, 5, 4), HSPS_18=
c(2,
2, 1, 2, 1, 2, 2, 1, 3, 2, 2, 5, 2, 1, 4, 3, 2, 1, 5, 1), HSPS_19=
c(3,
2, 2, 4, 2, 2, 3, 1, 4, 2, 2, 4, 1, 1, 4, 3, 2, 2, 5, 2), HSPS_20=
c(4,
4, 4, 3, 4, 3, 5, 3, 3, 3, 4, 3, 3, 4, 4, 3, 5, 3, 5, 2), HSPS_21=
c(3,
3, 4, 5, 3, 3, 5, 2, 4, 2, 3, 5, 4, 4, 3, 2, 3, 2, 5, 2), HSPS_22=
c(3,
5, 5, 4, 5, 4, 3, 2, 4, 3, 3, 5, 3, 2, 4, 2, 4, 3, 5, 2), HSPS_23=
c(2,
2, 1, 4, 2, 3, 4, 3, 3, 2, 2, 5, 3, 3, 3, 3, 3, 2, 5, 3), HSPS_24=
c(3,
2, 2, 3, 3, 3, 3, 2, 4, 2, 3, 5, 4, 2, 4, 4, 4, 3, 4, 2), HSPS_25=
c(3,
2, 2, 5, 3, 3, 5, 1, 4, 2, 3, 5, 3, 2, 4, 3, 3, 2, 5, 2), HSPS_26=
c(2,
1, 1, 3, 3, 3, 3, 2, 3, 2, 2, 5, 2, 2, 3, 3, 3, 2, 5, 2), HSPS_27=
c(2,
2, 1, 4, 3, 2, 3, 4, 3, 1, 4, 1, 1, 3, 4, 2, 3, 2, 5, 3), sex =
structure(c(2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L), .Label = c("m", "w", "d"), class = "factor"), Bildung =
structure(c(6L,
5L, 5L, 6L, 6L, 6L, 5L, 6L, 5L, 6L, 6L, 4L, 6L, 5L, 5L, 6L, 6L,
5L, 5L, 6L), .Label = c("kein", "Haupt", "mittlereR", "Fachabi",
"Abi", "Studium"), class = "factor"), job = structure(c(6L, 2L,
2L, 2L, 2L, 6L, 2L, 6L, 5L, 2L, 2L, 1L, 6L, 2L, 2L, 2L, 6L, 2L,
2L, 6L), .Label = c("hausl", "Student", "Azubi", "Suchend", "Rente",
"berufstaetig"), class = "factor"), age = c(23, 24, 21, 70, 25,
29, 22, 25, 57, 24, 25, 30, 31, 20, 28, 27, 26, 21, 24, 53),
VPN = 1:20, consent = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("ja",
"nein"), class = "factor"), psy = c(0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0), HSPS = c(86, 75,
69, 102, 85, 82, 97, 59, 100, 68, 80, 106, 68, 73, 105, 79,
91, 63, 119, 59), neuro = c(16, 3, 4, 10, 10, 11, 12, 5,
5, 5, 5, 16, 5, 18, 4, 3, 8, 5, 19, 7), extra = c(15, 17,
19, 7, 19, 17, 18, 17, 16, 10, 17, 14, 15, 19, 11, 13, 16,
18, 9, 13), TICS_Skala1 = c(23, 1, 22, 11, 14, 16, 22, 25,
2, 11, 1, 29, 9, 20, 10, 19, 16, 9, 18, 16), TICS_Skala2 = c(14,
12, 11, 9, 11, 10, 4, 10, 5, 8, 5, 24, 13, 5, 6, 6, 14, 2,
1, 13), TICS_Skala3 = c(21, 6, 10, 5, 12, 14, 11, 20, 3,
11, 4, 27, 20, 13, 7, 13, 20, 11, 11, 18), TICS_Skala4 = c(13,
14, 13, 2, 16, 23, 10, 9, 3, 13, 15, 18, 14, 11, 13, 10,
7, 9, 17, 6), TICS_Skala5 = c(12, 2, 6, 5, 3, 5, 8, 3, 4,
6, 0, 18, 3, 7, 1, 6, 6, 1, 13, 3), TICS_Skala6 = c(10, 2,
3, 4, 4, 6, 3, 0, 0, 5, 2, 15, 10, 5, 2, 1, 5, 2, 8, 3),
TICS_Skala7 = c(15, 5, 9, 13, 4, 8, 4, 9, 1, 6, 2, 11, 2,
12, 3, 2, 1, 3, 2, 7), TICS_Skala8 = c(8, 10, 3, 0, 11, 7,
2, 1, 2, 2, 7, 20, 7, 2, 2, 2, 1, 1, 2, 3), TICS_Skala9 = c(12,
3, 4, 8, 8, 6, 9, 5, 2, 6, 5, 11, 3, 11, 1, 5, 9, 3, 7, 5
), TICS_Skala10 = c(32, 5, 18, 16, 19, 18, 21, 16, 5, 17,
7, 39, 12, 24, 3, 15, 20, 6, 25, 14), neuro.c = c(6.08921933085502,
-6.91078066914498, -5.91078066914498, 0.089219330855018,
0.089219330855018, 1.08921933085502, 2.08921933085502,
-4.91078066914498,
-4.91078066914498, -4.91078066914498, -4.91078066914498,
6.08921933085502, -4.91078066914498, 8.08921933085502,
-5.91078066914498,
-6.91078066914498, -1.91078066914498, -4.91078066914498,
9.08921933085502, -2.91078066914498), extra.c = c(5.21003717472119,
7.21003717472119, 9.21003717472119, -2.78996282527881,
9.21003717472119,
7.21003717472119, 8.21003717472119, 7.21003717472119,
6.21003717472119,
0.21003717472119, 7.21003717472119, 4.21003717472119,
5.21003717472119,
9.21003717472119, 1.21003717472119, 3.21003717472119,
6.21003717472119,
8.21003717472119, -0.78996282527881, 3.21003717472119), age.c =
c(-15.4460966542751,
-14.4460966542751, -17.4460966542751, 31.5539033457249,
-13.4460966542751,
-9.4460966542751, -16.4460966542751, -13.4460966542751,
18.5539033457249,
-14.4460966542751, -13.4460966542751, -8.4460966542751,
-7.4460966542751,
-18.4460966542751, -10.4460966542751, -11.4460966542751,
-12.4460966542751, -17.4460966542751, -14.4460966542751,
14.5539033457249), HSP.c = c(-1.92936802973978, -12.9293680297398,
-18.9293680297398, 14.0706319702602, -2.92936802973978,
-5.92936802973978,
9.07063197026022, -28.9293680297398, 12.0706319702602,
-19.9293680297398,
-7.92936802973978, 18.0706319702602, -19.9293680297398,
-14.9293680297398,
17.0706319702602, -8.92936802973978, 3.07063197026022,
-24.9293680297398,
31.0706319702602, -28.9293680297398), HSPhoch = c(1, 0, 0,
1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0)), row.names =
c(NA, 20L), class = "data.frame")

Plotly does not plot a non-quadratic surface

I want to produce a 3D scatterplot and add a surface fitted with a linear regression, using plotly. My data:
data <- structure(list(political_trust = c(1, 6, 7, 5, 0, 2, 1, 3, 5,
0, 2, 5, 5, 6, 6, 3, 3, 2, 5, 8, 3, 7, 3, 4, 5, 4, 5, 0, 0, 4,
6, 1, 0, 4, 0, 5, 5, 6, 7, 3, 5, 4, 5, 2, 4, 4, 7, 6, 7, 5, 4,
6, 7, 5, 7, 3, 3, 3, 2, 5, 2, 7, 3, 2, 7, 2, 3, 0, 7, 5, 7, 3,
0, 7, 2, 6, 3, 8, 7, 2, 2, 5, 0, 1, 6, 3, 6, 5, 1, 3, 4, 4, 5,
3, 3, 0, 2, 4, 9, 6, 3, 3, 2, 3, 4, 5, 8, 0, 4, 1, 5, 0, 4, 0,
5, 6, 3, 2, 7, 5, 4, 3, 8, 3, 4, 0, 3, 6, 7, 7, 2, 3, 5, 5, 5,
0, 3, 2, 1, 7, 5, 0, 4, 0, 2, 7, 3, 0, 8, 3, 2, 4, 5, 5, 3, 2,
3, 8, 6, 5, 6, 7, 0, NA, 7, 7, 2, 0, 3, 4, 7, 2, 1, 2, 0, 0,
4, 3, 3, 6, 6, 1, 4, 0, 4, 0, 0, 7, 6, 4, 4, 6, 5, 4, 3, 3, 0,
NA, 2, 5), political_interest = c(2, 0, 3, 3, 2, 1, 2, 2, 2,
2, 2, 2, 3, 3, 3, 3, 2, 2, 3, 2, 1, 2, 2, 2, 2, 0, 2, 1, 3, 1,
1, 1, 1, 1, 2, 3, 2, 2, 2, 1, 3, 3, 2, 3, 2, 1, 3, 2, 0, 3, 1,
1, 2, 1, 2, 2, 1, 3, 3, 2, 3, 2, 3, 2, 2, 1, 2, 0, 3, 1, 2, 2,
1, 3, 2, 2, 1, 2, 2, 0, 3, 2, 2, 1, 2, 1, 1, 3, 1, 1, 3, 2, 0,
2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2,
2, 0, 0, 2, 3, 2, 2, 2, 3, 3, 0, 3, 3, 1, 2, 1, 1, 1, 2, 3, 2,
2, 2, 0, 2, 2, 2, 1, 2, 3, 3, 1, 2, 0, 1, 1, 0, 2, 2, 1, 2, 2,
2, 2, 3, 2, 1, 2, 2, 0, 0, 3, 2, 2, 2, 1, 2, 3, 0, 1, 2, 3, 2,
2, 2, 1, 3, 1, 1, 2, 2, 3, 3, 1, 2, 2, 2, 2, 2, 1, 0, 1, 1, 0,
3, 3), education_level = c(0, 2, 1, 5, 5, 0, 4, 4, 0, 0, 3, 2,
3, 4, 0, 4, 4, 4, 4, 3, 0, NA, 4, 0, 4, 3, 4, 1, 5, 2, NA, 0,
0, 4, 3, 3, 5, 3, 4, 0, 4, 4, 0, 4, 5, 4, 2, 2, 0, 5, 3, 0, 4,
1, 5, 4, 0, 4, 4, 5, 5, 4, 4, 4, 5, 2, 3, 2, 4, 0, 4, 0, 5, 4,
4, 4, 4, 4, 4, 2, 4, 5, 3, 4, 3, 0, 4, 4, 4, 3, 4, 4, 0, 3, 4,
2, 3, 3, 0, 4, 4, 4, 5, 4, 0, 4, 4, 4, 0, 3, 1, 4, NA, 4, 0,
1, 2, 4, 0, 2, 1, 4, 4, 4, 3, NA, 5, 2, 1, 0, 0, 4, 3, 3, 4,
3, 0, 3, NA, 4, 0, 0, 4, 5, 4, 5, 2, 2, 0, 3, 4, 3, 1, 3, 2,
3, 5, 0, 4, 5, 0, 5, 2, 0, 3, NA, NA, 2, 4, 3, 4, 3, 2, 2, 4,
4, 3, 0, 4, 0, 4, 4, 3, 0, 4, 4, 3, 5, 0, 3, 0, 4, 3, 0, 3, 3,
3, 4, 5, 1)), row.names = c(NA, -200L), class = "data.frame")
I start by defining a list of relevant variables; this is not strictly necessary, but is a consequence of using the code in a Shiny app:
library(plotly)
library(reshape2)   # for acast()
library(dplyr)

input <- list()
input$x <- "education_level"
input$y <- "political_trust"
input$z <- "political_interest"
Next, creating the surface data:
# Regressing "political_interest" on "education_level" and "political_trust":
lm <- lm(as.formula(paste0(input$z, " ~ ", input$x, " + ", input$y)), data)
# Defining range of values that outcome will be predicted for
axis_x <- seq(min(data[, input$x], na.rm = T),
max(data[, input$x], na.rm = T), by = 0.2)
axis_y <- seq(min(data[, input$y], na.rm = T),
max(data[, input$y], na.rm = T), by = 0.2)
# Predicting outcome, and getting data into surface format
lm_surface <- expand.grid(x = axis_x, y = axis_y, KEEP.OUT.ATTRS = F)
colnames(lm_surface) <- c(input$x, input$y)
lm_surface[[input$z]] <- predict(lm, newdata = lm_surface)   # predicted outcome on the grid
lm_surface <- acast(lm_surface, as.formula(paste0(input$x, " ~ ", input$y)),
value.var = input$z)
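A quick sanity check (a sketch, not in the original post): the dimensions of the cast matrix show which variable ended up on the rows and which on the columns.
dim(lm_surface)                    # rows follow axis_x (education_level), columns follow axis_y (political_trust)
c(length(axis_x), length(axis_y))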
Last, plotting this with plotly:
data %>%
filter(!is.na(get(input$z))) %>%
filter(!is.na(get(input$x))) %>%
filter(!is.na(get(input$y))) %>%
plot_ly(., x = ~jitter(get(input$x), factor = 2.5),
y = ~jitter(get(input$y), factor = 2.5),
z = ~jitter(get(input$z), factor = 2.5),
type = "scatter3d", mode = "markers",
marker = list(size = 2, color = "#cccccc")) %>%
add_surface(., z = lm_surface,
x = axis_x,
y = axis_y,
type = "surface")
This gives me the following. As you can see, the surface does not cover the full range of the y dimension. Note also that the plotted surface is "quadratic", i.e. square, with the same extent in x and y, although its dimensions should differ.
I can get plotly to draw a larger surface area, e.g. by changing the ranges of values as below, but it always stays square.
axis_x <- seq(0, 10, by = 0.2)
axis_y <- seq(0, 10, by = 0.2)
OK, question solved. It matters which dimension of the surface matrix (lm_surface) is which: plotly's surface trace reads the rows of the z matrix as y and the columns as x. Swapping x and y when applying acast fixes the issue:
lm_surface <- acast(lm_surface, as.formula(paste0(input$y, " ~ ", input$x)),
value.var = input$z)
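Equivalently (a sketch, not part of the original answer), the x ~ y cast could be kept and the matrix transposed at plotting time; here p stands for the scatter3d plot object built above.
add_surface(p, z = t(lm_surface), x = axis_x, y = axis_y)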

Understanding parameters inputting for scale_fill_continuous_divergingx for handling color margins

This question is a continuation of my previous question here.
I have a heatmap with a dataset available. The dataset is pasted below:
library(ggplot2)
library(colorspace)
bigtest <- structure(list(x = c(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8,
-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7,
-6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7, -6,
-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7, -6, -5,
-4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8),
y = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5),
z = c(1281.35043, 576.76381, 403.46607,
363.28815, 363.13356, 335.04997, 246.93314, 191.56371, 165.35087,
165.35087, 136.33712, 83.91203, 107.5773, 56.91087, 56.91089,
54.16559, 54.18172, 1841.60838, 1098.66304, 424.80686, 363.52776,
363.13355, 335.04998, 246.93314, 191.69473, 165.35087, 165.35087,
136.33712, 83.91204, 107.57729, 56.91087, 56.91088, 54.16421,
54.16794, 2012.52217, 1154.7927, 446.79023, 363.31379, 363.13356,
335.04997, 246.93314, 191.9613, 165.35087, 165.35087, 136.33712,
83.91202, 107.57731, 56.91088, 56.91088, 54.1642, 54.16559, 2077.10354,
1217.43403, 450.18301, 363.44225, 363.13357, 363.13363, 253.99753,
218.43223, 165.35087, 165.35014, 136.33712, 83.91203, 107.57822,
82.87399, 56.91087, 54.1642, 54.1642, 2092.56391, 1229.49925,
451.15179, 392.30728, 363.13356, 363.13282, 264.18944, 218.4308,
165.35087, 165.35044, 136.33712, 83.91202, 83.92709, 82.87353,
82.87406, 56.54491, 54.16421, 2206.93318, 1231.66411, 457.37767,
392.41558, 363.13357, 363.13283, 335.06272, 191.95211, 165.35087,
165.35014, 136.33712, 136.35211, 112.12755, 82.73634, 82.87353,
82.87418, 54.16421)),
row.names = c(NA, -102L),
class = c("tbl_df", "tbl", "data.frame"))
I'm generating a heatmap with the following code section:
ggplot(bigtest, aes(x = x, y = y)) +
geom_tile(aes(fill = z)) +
scale_fill_continuous_divergingx(palette = 'RdBu', rev = TRUE, mid = 347.48, l3 = 54, p3 = 2206, p4 = 325)
What I'm expecting from the plot is for the white color to be centered at a specific value and for the other gradients to diverge above and below that value. However, after experimenting with the different parameters, it seems I don't fully understand what l3, p3, and p4 actually refer to. The documentation for this function suggests that the parameters for customizing the scale come from the divergingx_hcl() function in the colorspace package.
The divergingx_hcl documentation only says that they are coordinates corresponding to different input parameters, which leaves me completely lost. Any guidance on wrapping my head around these parameters (not just l3, p3, and p4 but the others as well) would be greatly appreciated.
Created on 2019-11-07 by the reprex package (v0.3.0)
First, all colors are specified as HCL (hue, chroma, luminance), which correspond to the type of the color (red, green, blue, etc.), how colorful a color is (low chroma is gray, high chroma is very colorful), and how light a color is (high luminance is white, low luminance is black).
The parameter l3 indicates the luminance component of the color at one end of the color scale. (l1 is the luminance at the other end, and l2 is the luminance in the middle.) Luminance goes from 0 to 100. So, if you want the color at the end to be darker, set luminance to a lower value. The parameters p3 and p4 are exponents that govern how quickly the colors transition from the midpoint to the endpoint. In general, values closer to 0 mean quicker transitions, and values greater than 1 mean slower transitions. It's unlikely you'll ever want p3 or p4 values greater than 10.
To get the default parameters for a palette, you can use the divergingx_palettes() command:
library(colorspace)
divergingx_palettes('RdBu')
#> HCL palette
#> Name: RdBu
#> Type: Diverging (flexible)
#> Parameter ranges:
#> h1 h2 h3 c1 c2 c3 l1 l2 l3 p1
#> 20 NA 230 60 0 50 20 98 15 1.4
Created on 2019-11-07 by the reprex package (v0.3.0)
This shows you that the color at the end point specified by l3 is already quite dark. Changing l3 from 15 to 0 will make it a bit darker but not by much. Further, p2, p3, and p4 are not specified, which means they're all taken from p1, and hence are 1.4. Thus, color interpolation is somewhat slower than linear.
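To see what these parameters do in isolation, one option (a sketch, assuming the divergingx_hcl() arguments behave as documented) is to preview the palette directly with divergingx_hcl() and swatchplot():
library(colorspace)
swatchplot(
  "default"       = divergingx_hcl(11, palette = "RdBu"),
  "p3 = p4 = 0.2" = divergingx_hcl(11, palette = "RdBu", p3 = 0.2, p4 = 0.2),
  "plus l3 = 0"   = divergingx_hcl(11, palette = "RdBu", l3 = 0, p3 = 0.2, p4 = 0.2)
)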
With this knowledge, the following examples should make sense. To learn more about this, I recommend reading the various articles on the colorspace website: http://colorspace.r-forge.r-project.org/
First the data:
library(ggplot2)
library(colorspace)
bigtest <- structure(list(x = c(-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8,
-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7,
-6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7, -6,
-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -8, -7, -6, -5,
-4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8),
y = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5),
z = c(1281.35043, 576.76381, 403.46607,
363.28815, 363.13356, 335.04997, 246.93314, 191.56371, 165.35087,
165.35087, 136.33712, 83.91203, 107.5773, 56.91087, 56.91089,
54.16559, 54.18172, 1841.60838, 1098.66304, 424.80686, 363.52776,
363.13355, 335.04998, 246.93314, 191.69473, 165.35087, 165.35087,
136.33712, 83.91204, 107.57729, 56.91087, 56.91088, 54.16421,
54.16794, 2012.52217, 1154.7927, 446.79023, 363.31379, 363.13356,
335.04997, 246.93314, 191.9613, 165.35087, 165.35087, 136.33712,
83.91202, 107.57731, 56.91088, 56.91088, 54.1642, 54.16559, 2077.10354,
1217.43403, 450.18301, 363.44225, 363.13357, 363.13363, 253.99753,
218.43223, 165.35087, 165.35014, 136.33712, 83.91203, 107.57822,
82.87399, 56.91087, 54.1642, 54.1642, 2092.56391, 1229.49925,
451.15179, 392.30728, 363.13356, 363.13282, 264.18944, 218.4308,
165.35087, 165.35044, 136.33712, 83.91202, 83.92709, 82.87353,
82.87406, 56.54491, 54.16421, 2206.93318, 1231.66411, 457.37767,
392.41558, 363.13357, 363.13283, 335.06272, 191.95211, 165.35087,
165.35014, 136.33712, 136.35211, 112.12755, 82.73634, 82.87353,
82.87418, 54.16421)),
row.names = c(NA, -102L),
class = c("tbl_df", "tbl", "data.frame"))
Now the plots:
ggplot(bigtest, aes(x = x, y = y)) +
geom_tile(aes(fill = z)) +
scale_fill_continuous_divergingx(
palette = 'RdBu', rev = TRUE,
mid = 347.48
)
ggplot(bigtest, aes(x = x, y = y)) +
geom_tile(aes(fill = z)) +
scale_fill_continuous_divergingx(
palette = 'RdBu', rev = TRUE,
mid = 347.48,
p3 = .2,
p4 = .2
)
ggplot(bigtest, aes(x = x, y = y)) +
geom_tile(aes(fill = z)) +
scale_fill_continuous_divergingx(
palette = 'RdBu', rev = TRUE,
mid = 347.48,
l3 = 0,
p3 = .2,
p4 = .2
)
Created on 2019-11-07 by the reprex package (v0.3.0)

R Multiple T-test: Grouping factor must have 2 variables

I'm trying to compare a control group with an experimental group on a range of variables to show that they are similar at baseline.
I thus need to run multiple t-tests (unpaired / Welch t-tests). My data is in a long format, with the first variable called "Group" coded as either 1 or 2. There are some missing values in some of my other variables, but they are fairly random.
So when I run a t-test manually using this line of code:
t.test(variable_1 ~ Group, df)
it works.
I then tried to do it all at once using this line of code:
sapply(df[,2:71], function(i) t.test(i ~ df$Group)$p.value)
But I get the following error:
grouping factor must have exactly 2 levels
Could anyone help?
Here is what the structure looks like:
df <- structure(list(Group = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2), EM_Accuracy_Time_Airport = c(3, 3, 0,
1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 3, 1, 3, 1, 1), EM_Accuracy_Place_Airport = c(2,
2, 1, 2, 1, 2, 2, 1, 1, 2, 0, 2, 2, 0, 2, 2, 2, 1, 1, 1), EM_Accuracy_Expl_Airport = c(2,
2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 0, 2, 2, 1), EM_Accuracy_Death_Airport = c(0,
2, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0), EM_Accuracy_Time_Metro = c(3,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 3, 3), EM_Accuracy_Death_Metro = c(3,
0, 1, 0, 1, 1, 0, 0, 0, 3, 0, 0, 1, 0, 3, 1, 1, 1, 0, 0), EM_Accuracy_PC_Time_Airpot = c(100,
100, 0, 33.3333333333333, 33.3333333333333, 66.6666666666667,
66.6666666666667, 33.3333333333333, 33.3333333333333, 100, 100,
66.6666666666667, 66.6666666666667, 66.6666666666667, 33.3333333333333,
100, 33.3333333333333, 100, 33.3333333333333, 33.3333333333333
), EM_Accuracy_PC_Place_Airport = c(100, 100, 50, 100, 50, 100,
100, 50, 50, 100, 0, 100, 100, 0, 100, 100, 100, 50, 50, 50),
EM_Accuracy_PC_Expl_Airport = c(100, 100, 100, 0, 100, 100,
100, 50, 100, 100, 100, 100, 100, 0, 0, 50, 0, 100, 100,
50), EM_Accuracy_PC_Death_Airport = c(0, 66.6666666666667,
0, 0, 33.3333333333333, 66.6666666666667, 0, 0, 0, 0, 0,
0, 66.6666666666667, 0, 0, 0, 100, 0, 0, 0), EM_Accuracy_PC_Time_Metro = c(100,
33.3333333333333, 0, 0, 33.3333333333333, 33.3333333333333,
0, 33.3333333333333, 33.3333333333333, 33.3333333333333,
33.3333333333333, 66.6666666666667, 33.3333333333333, 100,
33.3333333333333, 33.3333333333333, 66.6666666666667, 33.3333333333333,
100, 100), EM_Accuracy_PC_Death_Metro = c(100, 0, 33.3333333333333,
0, 33.3333333333333, 33.3333333333333, 0, 0, 0, 100, 0, 0,
33.3333333333333, 0, 100, 33.3333333333333, 33.3333333333333,
33.3333333333333, 0, 0), EM_ACCURACY_PC = c(83.3333333333333,
66.6666666666667, 30.5555555555556, 22.2222222222222, 47.2222222222222,
66.6666666666666, 44.4444444444444, 27.7777777777778, 36.1111111111111,
72.2222222222222, 38.8888888888889, 55.5555555555555, 66.6666666666666,
27.7777777777778, 44.4444444444444, 52.7777777777778, 55.5555555555556,
52.7777777777778, 47.2222222222222, 38.8888888888889), EM_Certainty_Time_Airport = c(3,
1, 1, 1, 2, 2, 1, 1, 2, 3, 3, 2, 2, 2, 4, 2, 3, 3, 2, 2),
EM_Certainty__Place_Airport = c(3, 4, 2, 2, 2, 2, 4, 1, 3,
4, 4, 4, 4, 3, 3, 4, 4, 3, 2, 3), EM_Certainty__Expl_Airport = c(4,
2, 3, 1, 2, 3, 2, 1, 2, 4, 1, 3, 2, 2, 1, 3, 1, 2, 2, 3),
EM_Certainty__Death_Airport = c(1, 1, NA, 1, 2, 1, 3, 1,
2, 3, NA, 3, 2, 1, 2, 1, 1, 1, 4, 4), EM_Certainty__Time_Metro = c(3,
3, 1, 1, 2, 2, 2, 1, 3, 2, 3, 2, 3, 2, 2, 2, 3, 1, 2, 2),
EM_Certainty__Death_Metro = c(2, 1, 1, NA, 2, 1, 1, 1, 2,
1, NA, 3, 2, 1, 1, 1, 1, 1, 1, 4), EM_CERTAINTY = c(2.66666666666667,
2, 1.6, 1.2, 2, 1.83333333333333, 2.16666666666667, 1, 2.33333333333333,
2.83333333333333, 2.75, 2.83333333333333, 2.5, 1.83333333333333,
2.16666666666667, 2.16666666666667, 2.16666666666667, 1.83333333333333,
2.16666666666667, 3), EM_CONFIDENCE = c(5, 5, 1, 2, 2, 4,
5, 2, 3, 4, 5, 5, 3, 3, 4, 4, 3, 2, 3, 2), FBM_CONFIDENCE = c(4,
6, 7, 7, 5, 4, 2, 7, 5, 6, 6, 7, 6, 7, 3, 6, 6, 4, 5, 6),
FBM_Vividness_Time = c(3, 3, 1, 4, 3, 2, 4, 3, 4, 4, 1, 3,
4, 4, 3, 3, 3, 2, 4, 3), FBM_Vividness_How = c(4, 4, 2, 4,
4, 3, 4, 4, 4, 4, 3, 4, 3, 4, 4, 4, 4, 4, 4, 4), FBM_Vividness_Where = c(4,
4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4),
FBM_Vividness_WithWhom = c(4, 4, 3, 4, 3, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4), FBM_Vividness_WereDoing = c(4,
4, 1, 4, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4),
FBM_Vividness_Did_After = c(4, 4, 3, 4, 2, 3, 4, 4, 2, 4,
1, 4, 4, 4, 3, 4, 4, 3, 4, 4), FBM_VIVIDNESS = c(3.83333333333333,
3.83333333333333, 2, 4, 3.16666666666667, 3.33333333333333,
4, 3.83333333333333, 3.66666666666667, 4, 2.33333333333333,
3.83333333333333, 3.83333333333333, 4, 3.66666666666667,
3.83333333333333, 3.83333333333333, 3.5, 4, 3.83333333333333
), FBM_Details_NB_T2 = c(3, 5, 0, 5, 5, 5, 2, 5, 1, 5, 3,
5, 5, 5, 2, 4, 2, 3, 5, 5), P_Novelty_5 = c(5, 6.2, 6.5,
5.6, 4.8, 5.4, 4, 4.2, 4.4, 5.8, 3.4, 5.8, 6, 5.8, 3.8, 6.4,
6.8, 6.6, 7, 3), P_Suprise_emotion = c(6, 6, 6, 6, 4, 5,
1, 7, 1, 5, 4, 5, 7, 7, 6, 4, 7, 7, 2, 5), P_Surprise_Expected = c(1,
3, 5, 2, 4, 3, 6, 2, 2, 1, 6, 4, 3, 1, 5, 1, 1, 1, 5, 4),
P_Surprise_Unbelievable = c(5, 4, 1, 6, 4, 4, 2, 7, 1, 4,
1, 6, 7, 7, 6, 3, 7, 7, 5, 3), `P_Consequence-Importance_5` = c(5.6,
4.8, 3.4, 5, 4.8, 4, 5, 5.4, 3, 5.2, 6.8, 5.4, 4, 4.4, 6,
3.8, 4, 4.8, 5, 5.2), P_Emotional_Intensity_4 = c(5.25, 5.75,
3, 4.75, 4.75, 6, 4, 5.25, 2.5, 5.5, 7, 6.5, 5.75, 6.75,
6.75, 6, 6.25, 6, 5, 2.5), P_Social_Sharing_6 = c(3.66666666666667,
3.83333333333333, 3.4, 3.16666666666667, 3, 3.33333333333333,
3.8, 3.16666666666667, 2.16666666666667, 4.16666666666667,
4, 4.5, 4.5, 4.33333333333333, 4, 3.16666666666667, 3.66666666666667,
4, NA, NA), P_Media_3 = c(4.66666666666667, 4, 3, 2.66666666666667,
2.66666666666667, 2.33333333333333, 3, 2.33333333333333,
2.33333333333333, 3.33333333333333, 4.33333333333333, 5,
4.33333333333333, 5, 4, 2, 3, 3.33333333333333, 2, 1.66666666666667
), P_Ruminations = c(3, NA, 3, 2, 4, NA, 4, 2, 1, 4, 4, 4,
2, 4, 2, 3, 3, 3, 4, 3), P_Novelty_Common_rev = c(6, 7, 7,
7, 4, 6, 4, 7, 2, 6, 3, 7, 7, 7, 3, 6, 7, 7, 7, 3), P_Novelty_Unusual = c(2,
5, 7, 7, 3, 5, 3, 3, 5, 6, 1, 4, 7, 1, 4, 6, 6, 6, 7, 2),
P_Novelty_Special = c(6, 6, NA, 6, 5, 5, 4, 3, 5, 4, 1, 5,
6, 7, 4, 6, 7, 7, 7, 3), P_Novelty_Singular = c(4, 6, 5,
1, 5, 5, 4, 1, 3, 6, 5, 6, 4, 7, 3, 7, 7, 6, 7, 2), P_Novelty_Ordinary_rev = c(7,
7, 7, 7, 7, 6, 5, 7, 7, 7, 7, 7, 6, 7, 5, 7, 7, 7, 7, 5),
P_Consequence = c(6, 7, 5, 4, 5, 4, 5, 3, 5, 5, 7, 5, 5,
2, 6, 6, 1, 4, 6, 3), P_Importance_self = c(4, 3, 3, 4, 4,
3, 5, 6, 1, 5, 7, 5, 3, 3, 5, 2, 2, 4, 5, 3), `P_Importance_friends&family` = c(4,
4, 3, 4, 4, 4, 4, 6, 1, 5, 6, 5, 3, 3, 5, 2, 6, 4, 5, 10),
P_Importance_Belgium = c(7, 5, 3, 7, 6, 5, 6, 7, 3, 7, 7,
7, 5, 7, 7, 5, 6, 7, 6, 6), P_Importance_International = c(7,
5, 3, 6, 5, 4, 5, 5, 5, 4, 7, 5, 4, 7, 7, 4, 5, 5, 3, 4),
P_Emotional_Intensity_Upset = c(4, 5, NA, 3, 3, 5, 3, 5,
2, 5, 7, 5, 5, 6, 7, 6, 6, 5, 5, 3), P_Emotional_Intensity_Indiferent_rev = c(7,
7, 5, 7, 6, 7, 4, 6, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, NA, 4),
P_Emotional_Intensity_Affected = c(6, 6, 3, 5, 5, 6, 5, 6,
2, 5, 7, 7, 5, 7, 7, 6, 6, 6, NA, 2), P_Emotional_Intensity_Shaken = c(4,
5, 1, 4, 5, 6, 4, 4, 2, 5, 7, 7, 6, 7, 6, 5, 6, 6, 5, 1),
P_Rehearsal_Media_TV = c(5, 3, NA, 3, 2, 3, NA, 1, 1, 4,
3, 5, 5, 5, 2, 3, 2, 2, 2, 2), P_Rehearsal_Media_Internet = c(4,
4, 1, 3, 2, 2, 2, 4, 3, 2, 5, 5, 3, 5, 5, 1, 5, 4, 2, 1),
P_Rehearsal_Media_Social_Networks = c(5, 5, 5, 2, 4, 2, 4,
2, 3, 4, 5, 5, 5, 5, 5, 2, 2, 4, 2, 2), P_Social_Sharing_How_Often = c(4,
5, 4, 4, 4, 3, 3, 3, 3, 5, 4, 5, 5, 5, 5, 3, 4, 4, 5, NA),
P_Social_Sharing_With_How_Many_People = c(5, 4, NA, 3, 3,
3, 3, 3, 2, 5, 3, 5, 5, 3, 5, 3, 3, 4, 3, NA), PK_Shops_YN = c(0,
1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1),
PK_Comic = c(0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 1, 0), PK_Hotel = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0), PK_Decoration_Maelbeek = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1),
PK_Stations_before_after_Maelbeek = c(0, 0.5, 0, 0, 0, 0,
0, 0, 0.5, 1, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0.5, 0), PK_TOTAL_PC = c(0,
50, 0, 40, 40, 40, 20, 0, 10, 60, 20, 40, 90, 70, 20, 0,
30, 20, 70, 40), SI_Attachment_BXL = c(6, 4, 1, 4, 2, 5,
1, 6, 5, 4, 2, 6, 6, 7, 1, 3, 6, 4, 5, 4), SI_Pride_BXL = c(1,
2, 1, 2, 1, 2, 1, 5, 1, 6, 1, 1, 7, 7, 1, 2, 6, 1, 3, 3),
SI_Attachment_Belgium = c(7, 3, 5, 5, 4, 6, 7, 6, 5, 6, 7,
7, 7, 7, 5, 6, 7, 6, 4, 2), SI_Pride_Belgium = c(7, 2, 6,
4, 2, 6, 4, 5, 1, 5, 1, 6, 7, 7, 5, 7, 7, 6, 2, 2), SI_Attachment_EU = c(6,
4, 2, 5, 4, 4, 5, 4, 7, 4, 1, 6, 7, 7, 5, 4, 6, 6, 2, 6),
SI_Pride_EU = c(7, 1, 1, 4, 3, 4, 4, 4, 1, 4, 1, 6, 7, 7,
4, 3, 6, 6, 2, 4)), .Names = c("Group", "EM_Accuracy_Time_Airport",
"EM_Accuracy_Place_Airport", "EM_Accuracy_Expl_Airport", "EM_Accuracy_Death_Airport",
"EM_Accuracy_Time_Metro", "EM_Accuracy_Death_Metro", "EM_Accuracy_PC_Time_Airpot",
"EM_Accuracy_PC_Place_Airport", "EM_Accuracy_PC_Expl_Airport",
"EM_Accuracy_PC_Death_Airport", "EM_Accuracy_PC_Time_Metro",
"EM_Accuracy_PC_Death_Metro", "EM_ACCURACY_PC", "EM_Certainty_Time_Airport",
"EM_Certainty__Place_Airport", "EM_Certainty__Expl_Airport",
"EM_Certainty__Death_Airport", "EM_Certainty__Time_Metro", "EM_Certainty__Death_Metro",
"EM_CERTAINTY", "EM_CONFIDENCE", "FBM_CONFIDENCE", "FBM_Vividness_Time",
"FBM_Vividness_How", "FBM_Vividness_Where", "FBM_Vividness_WithWhom",
"FBM_Vividness_WereDoing", "FBM_Vividness_Did_After", "FBM_VIVIDNESS",
"FBM_Details_NB_T2", "P_Novelty_5", "P_Suprise_emotion", "P_Surprise_Expected",
"P_Surprise_Unbelievable", "P_Consequence-Importance_5", "P_Emotional_Intensity_4",
"P_Social_Sharing_6", "P_Media_3", "P_Ruminations", "P_Novelty_Common_rev",
"P_Novelty_Unusual", "P_Novelty_Special", "P_Novelty_Singular",
"P_Novelty_Ordinary_rev", "P_Consequence", "P_Importance_self",
"P_Importance_friends&family", "P_Importance_Belgium", "P_Importance_International",
"P_Emotional_Intensity_Upset", "P_Emotional_Intensity_Indiferent_rev",
"P_Emotional_Intensity_Affected", "P_Emotional_Intensity_Shaken",
"P_Rehearsal_Media_TV", "P_Rehearsal_Media_Internet", "P_Rehearsal_Media_Social_Networks",
"P_Social_Sharing_How_Often", "P_Social_Sharing_With_How_Many_People",
"PK_Shops_YN", "PK_Comic", "PK_Hotel", "PK_Decoration_Maelbeek",
"PK_Stations_before_after_Maelbeek", "PK_TOTAL_PC", "SI_Attachment_BXL",
"SI_Pride_BXL", "SI_Attachment_Belgium", "SI_Pride_Belgium",
"SI_Attachment_EU", "SI_Pride_EU"), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
The error you get means that there's a problem in your dataset, with at least one of your variables.
Here's a process to help you spot problematic variables:
library(tidyverse)
df %>%
group_by(Group) %>% # for each group value
summarise_all(~sum(!is.na(.))) %>% # count non NA values for each variable
gather(var,value,-Group) %>% # reshape
spread(Group, value, sep = "_") %>% # reshape
filter(Group_2 < 2) # get problematic variables
# # A tibble: 5 x 3
# var Group_1 Group_2
# <chr> <int> <int>
# 1 P_Emotional_Intensity_Affected 18 1
# 2 P_Emotional_Intensity_Indiferent_rev 18 1
# 3 P_Social_Sharing_6 18 0
# 4 P_Social_Sharing_How_Often 18 1
# 5 P_Social_Sharing_With_How_Many_People 17 1
A count of 0 will throw the error about the grouping factor needing exactly two levels.
A count of 1 will throw an error about needing more observations in one of your groups.
After spotting those you have to treat them accordingly and then your original t.test code should work.
So my problem was just missing data in one variable.
However, if you are looking to do multiple t-tests on data in this long format, this line of code works:
sapply(df[,2:71], function(i) t.test(i ~ df$Group)$p.value)
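If some columns still have too few observations in one of the groups, a hedged option (not from the original answer) is to catch the per-column errors and return NA for those variables:
sapply(df[, 2:71], function(i) {
  tryCatch(t.test(i ~ df$Group)$p.value,    # Welch t-test p-value for this column
           error = function(e) NA_real_)    # NA for columns that cannot be tested
})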
