Stacked barplot using ggplot2 - data visualisation - r

I have very little experience with R and am trying to make a stacked barplot using ggplot2.
I have 2 groups - control and experimental, and 2 choices - red and green. I'm not sure how to organise my data.
There were 80 animals in my trial (control n=40, experimental n=40) and they were given the choice of red and green substrate, I noted which substrate they chose, and that's the data I'm trying to plot.
I would essentially want 'Experimental' and 'Control' on the x-axis, and the number of choices on the y-axis (e.g. Control, Red n = 20; Control, Green n = 12, etc.).
Any help would be appreciated!
Edited to add:
This is the graph it's outputting
This is the code I'm using (including suggested adjustments):
# Example data: 40 "control" rows followed by 40 "experimental" rows, each
# with a substrate drawn at random (no seed is set here, so the counts change
# from run to run).
df <- data.frame(group = rep(c("control", "experimental"), each = 40),
substrate = sample (c("red","green"), 80, TRUE))
# NOTE(review): y is mapped to the categorical substrate column and
# stat = "identity" is requested, so geom_bar() is not asked to count rows
# here - presumably this is why the plot does not show counts; confirm.
ggplot(df, aes(x = group, y = substrate, fill = substrate)) +
geom_bar(stat = "identity") +
# NOTE(review): unnamed manual values are presumably matched to the levels in
# order ("green", "red"), which would colour green bars red - confirm.
scale_fill_manual(values = c("red", "green"))
This is the output:
structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("control", "experimental"
), class = "factor"), substrate = structure(c(1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("green",
"red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
Output from dput(behaviour) - the original data frame:
structure(list(group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Experimental"
), class = "factor"), substrate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Green",
"Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))

Your data:
behaviour=structure(list(group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Experimental"
), class = "factor"), substrate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Green",
"Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
We can tabulate your data:
# Cross-tabulate group (rows) against chosen substrate (columns).
table(behaviour[["group"]], behaviour[["substrate"]])
Green Red
Control 10 30
Experimental 27 13
geom_bar() counts the rows for you by default (stat = "count"), so you should not map a y aesthetic as well. In your case, map only x and fill and let geom_bar() do the counting:
# Stacked bars: one bar per group, split by substrate. No y aesthetic is
# given, so geom_bar() uses its default row count for the bar heights.
ggplot(data = behaviour, mapping = aes(x = group, fill = substrate)) +
  geom_bar() +
  scale_fill_manual(values = c("#29c7ac", "#c02739"))

You could have your data like this, with one row for each observation (i.e. each animal), with the group and the substrate recorded for each:
# One row per animal: 40 control animals followed by 40 experimental ones.
# Substrate choices are laid out per group as green-then-red runs
# (control: 10 green, 30 red; experimental: 27 green, 13 red).
df <- data.frame(
  group = rep(c("control", "experimental"), each = 40),
  substrate = c(
    rep(c("green", "red"), times = c(10, 30)),
    rep(c("green", "red"), times = c(27, 13))
  )
)
Now define your plot using ggplot, specifying group as your x axis, and ..count.. (written after_stat(count) in ggplot2 3.3.0 and later) as your y axis. Use geom_bar to get the stacked bars you are looking for, and finally use scale_fill_manual to set the colours:
library(ggplot2)
# Stacked bar chart of the number of animals per group, split by substrate.
# after_stat(count) is the current spelling of the older `..count..` notation:
# it maps y to the row count computed by geom_bar()'s default stat.
ggplot(df, aes(x = group, y = after_stat(count), fill = substrate)) +
  # Black outline separates the stacked segments.
  geom_bar(colour = "black") +
  # Values are given in the order of the substrate levels ("green", "red").
  scale_fill_manual(values = c("green", "red"))

Related

Getting specific combination of interaction as variable in logistic regression with R

I have this dataset and want to perform a regression analysis on it. I have two predictor variables, urban_rural and religious. Now I want to have two specific interaction variables: 1.) Urban/not religious and 2.) Rural/religious. I know that an interaction can be specified with the * operator, but this does not give me the desired combination of interactions. I guess one has to set the reference level manually?
structure(list(urban_rural = structure(c(1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Urban", "Rural", "Refugee camp"
), class = "factor"), religious = structure(c(2L, 1L, 2L, 2L,
3L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 1L, 3L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 1L, 3L, 1L, 2L, 3L, 2L,
2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 3L, 2L, 3L, 1L), .Label = c("Religious", "Somewhat religious",
"Not religious"), class = "factor"), family_role_recoded = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("Agree/strongly agree",
"Disagree/strongly disagree", "Don't know"), class = "factor")), row.names = c(NA,
250L), class = "data.frame")
I used these regression models:
# Three logistic regressions on the same subset of rows: respondents who
# answered "Don't know" and those living in a refugee camp are excluded.

# Predictor: urban_rural only.
model1 <- glm(
  family_role_recoded ~ urban_rural,
  data = dataset,
  subset = family_role_recoded != "Don't know" &
    urban_rural != "Refugee camp",
  family = binomial(link = "logit")
)

# Predictor: religious only.
model2 <- glm(
  family_role_recoded ~ religious,
  data = dataset,
  subset = family_role_recoded != "Don't know" &
    urban_rural != "Refugee camp",
  family = binomial(link = "logit")
)

# Both predictors, main effects only (no interaction).
model3 <- glm(
  family_role_recoded ~ urban_rural + religious,
  data = dataset,
  subset = family_role_recoded != "Don't know" &
    urban_rural != "Refugee camp",
  family = binomial(link = "logit")
)
Does anyone have an idea how to solve this problem?
First, set the reference level of religious to "Somewhat religious". Then we can look at the results of the full interaction model:
library(broom)
# Make "Somewhat religious" the baseline level, so that "Religious" and
# "Not religious" each get their own coefficient.
dataset$religious <- relevel(dataset$religious, ref = "Somewhat religious")
# Logistic regression with both main effects and their full interaction.
fit0 <- glm(
  family_role_recoded ~ urban_rural * religious,
  data = dataset,
  family = binomial()
)
# Print the coefficient table as a tibble (tidy() comes from broom).
tidy(fit0)
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.902 0.181 -4.99 6.03e-7
2 urban_ruralRural -0.484 0.532 -0.910 3.63e-1
3 religiousReligious -0.0141 0.456 -0.0308 9.75e-1
4 religiousNot religious 1.47 0.391 3.76 1.67e-4
5 urban_ruralRural:religiousReligious 0.995 1.14 0.876 3.81e-1
6 urban_ruralRural:religiousNot religio… 0.201 0.993 0.203 8.39e-1
You already have one of the terms you want, Rural/Religious (urban_ruralRural:religiousReligious). Intuitively, the Urban/Not religious term would be the flip of urban_ruralRural:religiousNot religious. We can also manually define the interaction terms we need:
# 0/1 indicator columns for the two specific cells of interest.
dataset$Rural_religious <- as.numeric(
  dataset$urban_rural == "Rural" & dataset$religious == "Religious"
)
dataset$Urban_not_religious <- as.numeric(
  dataset$urban_rural == "Urban" & dataset$religious == "Not religious"
)
# The leading 0 removes the intercept from the model formula; the two
# indicators stand in for the interaction terms.
fit <- glm(
  family_role_recoded ~ 0 + urban_rural + religious +
    Urban_not_religious + Rural_religious,
  data = dataset,
  family = binomial()
)
tidy(fit)
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 urban_ruralUrban -0.902 0.181 -4.99 0.000000603
2 urban_ruralRural -1.39 0.500 -2.77 0.00556
3 religiousReligious -0.0141 0.456 -0.0308 0.975
4 religiousNot religious 1.67 0.913 1.83 0.0667
5 Urban_not_religious -0.201 0.993 -0.203 0.839
6 Rural_religious 0.995 1.14 0.876 0.381
You need to do a post hoc test. For that you can use the R package "emmeans"

Running ggplot inside foreach is slower than running it with just a for loop

I am making a map of a dataset with 20 million data points in ggplot. A single map (with facetting) takes 10-15 mins, so I checked if using multiple cores in parallel mode would work better. Using foreach, maps took more time to make. Some foreach runs were running for 30min-1 hour without producing any results. I know that the answer to this question (Why is the parallel package slower than just using apply?) shows that in parallel computation, sometimes it takes more time to combine the result from the separate parallel processes than running the task itself. But I saw some run examples when I was searching that minute-runs can be improved. Do you think it is possible?
Due to the sheer amount of data I have, I sampled my dataset:
structure(list(lat = c(46.791667, 52.958333, 57.375, 62.625,
74.041667, 60.208333, 30.208333, 56.791667, 57.375, 40.958333,
56.541667, 38.958333, 35.291667, 43.625, 71.375, 66.875, 74.375,
66.458333, 47.791667, 48.041667, 41.541667, 40.875, 57.208333,
64.375, 42.625, 43.958333, 69.958333, 72.375, 36.875, 66.958333,
39.791667, 36.625, 52.625, 65.708333, 42.208333, 53.708333, 35.458333,
58.625, 34.875, 57.291667, 59.708333, 61.708333, 72.041667, 59.958333,
32.208333, 43.625, 39.541667, 62.625, 41.208333, 32.291667, 48.958333,
47.291667, 60.375, 49.458333, 37.208333, 65.708333, 57.958333,
31.041667, 63.875, 43.625, 54.541667, 55.541667, 45.458333, 72.375,
54.708333, 37.958333, 32.375, 60.125, 59.041667, 37.875, 42.958333,
44.375, 59.791667, 49.208333, 34.375, 53.208333, 59.458333, 53.375,
45.458333, 72.125, 66.208333, 60.958333, 47.625, 60.291667, 41.125,
67.541667, 54.625, 55.541667, 37.541667, 44.291667, 44.458333,
40.041667, 49.458333, 39.625, 73.375, 41.458333, 71.375, 31.041667,
66.791667, 42.541667), lon = c(-6.125, -19.541667, -29.291667,
-9.2083333, -11.541667, -6.625, -25.708333, -14.458333, -48.291667,
-63.541667, -41.291667, -12.541667, -48.291667, -58.708333, 6.625,
-2.7083333, -69.375, -19.291667, -27.208333, -36.625, -17.791667,
-50.541667, -38.708333, 9.375, -56.208333, -44.958333, -59.041667,
8.875, -21.125, -24.791667, -40.375, -26.208333, -31.875, -11.875,
-60.958333, -39.125, -32.458333, -54.791667, -44.541667, -37.958333,
-48.625, -10.541667, 3.5416667, -17.791667, -16.041667, -9.9583333,
-32.708333, 0.875, -18.625, -54.208333, -63.125, -56.458333,
-55.125, -22.708333, -40.958333, -56.208333, -33.625, -69.125,
-58.125, -17.541667, -23.541667, -17.041667, -18.458333, -64.791667,
-25.208333, -35.875, -44.791667, -11.291667, -58.291667, -46.208333,
-41.208333, -5.0416667, -38.208333, -13.875, -55.291667, -17.291667,
-48.625, -38.791667, -59.375, -19.291667, 5.5416667, -19.625,
-41.375, -66.291667, -17.625, -14.208333, -39.291667, -48.875,
-16.541667, -21.375, -46.375, 2.625, -60.291667, -40.375, 2.4583333,
-16.458333, 4.875, -66.291667, -4.2916667, -36.458333), entity = structure(c(2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L), .Label = c("Cc", "Li"), class = "factor"), watcon = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 2L), .Label = c("calm", "stormy"), class = "factor"),
step = structure(c(1L, 3L, 2L, 4L, 4L, 2L, 3L, 3L, 3L, 4L,
2L, 2L, 3L, 2L, 1L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L,
4L, 1L, 2L, 2L, 4L, 2L, 4L, 4L, 3L, 4L, 2L, 2L, 1L, 2L, 2L,
1L, 4L, 3L, 2L, 3L, 3L, 2L, 4L, 1L, 3L, 2L, 2L, 3L, 4L, 4L,
2L, 2L, 3L, 4L, 1L, 3L, 2L, 1L, 4L, 2L, 3L, 1L, 3L, 1L, 2L,
1L, 4L, 4L, 3L, 1L, 1L, 1L, 3L, 1L, 2L, 2L, 4L, 3L, 4L, 4L,
4L, 1L, 2L, 1L, 4L, 3L, 4L, 2L, 4L, 1L, 4L, 1L, 2L, 2L, 4L
), .Label = c("abundance", "enccomb", "adscomb", "infcomb"
), class = "factor")), row.names = c(NA, -100L), class = "data.frame")
The code I am using is:
library(oceanmap)
library(sf)
library(ggmap)
library(rnaturalearth)
library(rnaturalearthdata)
library(rgeos)
library(tidyverse)
world <- ne_countries(scale = "medium", returnclass = "sf")
library(doParallel)
library(foreach)
# Start a 5-worker cluster and register it as the foreach backend.
cl <- makeCluster(5)
doParallel::registerDoParallel(cl)
entities <- unique(sample$entity)
# Draw one faceted map per entity and write it to "<index>.png".
# seq_along() is used instead of 1:length() so that an empty `entities`
# yields zero iterations rather than iterating over c(1, 0).
# NOTE(review): `value` and `step2` are not columns of the sample data shown
# with this code; presumably they exist in the full data set - confirm.
foreach(i = seq_along(entities), .packages = c("tidyverse", "dplyr")) %dopar% {
  p <- ggplot(data = world) +
    geom_sf(color = "white", fill = "white") +
    coord_sf(xlim = c(-34, -30), ylim = c(53, 62), expand = FALSE) +
    geom_point(
      data = subset(sample, sample$entity == entities[i]),
      mapping = aes(x = lon, y = lat, color = log10(value))
    ) +
    facet_grid(watcon ~ step2) +
    ggtitle(entities[i])
  # Hand the plot to ggsave() explicitly rather than relying on the
  # "last plot" state inside a worker process.
  ggsave(filename = paste0(i, ".png"), plot = p)
}
stopCluster(cl)
P.S. I am using ggplot because of the opportunities for editing the map itself. Lattice is faster but lacks the customization that I am looking for.
Any help on how to improve my foreach code or if it's even possible is greatly appreciated!

Difference between densityPlot and geom_density

I am plotting the same data with both ggplot2's geom_density and densityPlot (from the car package):
head(df)
min_dist iteration
<dbl> <fct>
1 -79277 1
2 -68987 1
3 -98661 1
4 -98145 1
5 -222408 1
6 -217409 1
geom_density:
# Density curve of min_dist with a rug of the individual observations.
# NOTE(review): xlim() sets scale limits, and ggplot2 presumably drops values
# outside scale limits before the density is estimated, so this curve would
# be based only on points inside [-1e4, 1e4]. coord_cartesian(xlim = ...)
# zooms without discarding data - confirm against the ggplot2 docs.
ggplot(df) + geom_density(aes(min_dist)) + xlim(c(-1e4, 1e4)) + geom_rug(aes(min_dist))
densityPlot:
densityPlot(df$min_dist, xlim = c(-1e4, 1e4))
And they are strikingly different!
I was under the impression that geom_density was a wrapper for stat_density.
Is this just a matter of different adjust values?
df <- structure(list(min_dist = c(-79277, -68987, -98661, -98145, -222408,
-217409, -10759, -10363, 48034, 48525, 617038, 617069, -19656,
-17396, -479, -333, -48721, -47691, -19761, 380, 31429, 33953,
-9220, -61971, -609, -560, 6228, 13082, -217137, -216639, 99268,
102365, -119574, -53795, -329407, -111401, 1018733, -559898,
-1346, -15, -71577, -2883, 19490, -12321, -12240, -8470, -29804,
28724, 40673, -57818, -41421, -39126, 136783, 165727, -52989,
-51058, 58995, -12226, 8057, 44217, 13280, 16828, -80410, 11324,
975, 2630, 14250, 17108, -29341, -27033, -79277, -68987, -98661,
-98145, -222408, -217409, -10759, -10363, 48034, 48525, 617038,
617069, -19656, -17396, -479, -333, -48721, -47691, -19761, 380,
31429, 33953, -9220, -61971, -609, -560, 6228, 13082, -217137,
-216639, 99268, 102365, -119574, -53795, -329407, -111401, 1018733,
-559898, -1346, -15, -71577, -2883, 19490, -12321, -12240, -8470,
-29804, 28724, 40673, -57818, -41421, -39126, 136783, 165727,
-52989, -51058, 58995, -12226, 8057, 44217, 13280, 16828, -80410,
11324, 975, 2630, 14250, 17108, -29341, -27033), iteration = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor")), row.names = c(NA, -140L), class = c("tbl_df",
"tbl", "data.frame"))

Standard evaluation with mutate_ to calculate percentages by group

I am trying to use standard evaluation with dplyr to calculate percents as a function of two grouping variables. The problem is in my mutate_ statement.
Here is a dataset:
structure(list(
var1 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L
),
.Label = c("No", "Yes"), class = "factor"),
var2 = structure(c(2L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L
),
.Label = c("Female", "Male"), class = "factor")),
.Names = c("var1", "var2"), row.names = c(NA, -100L), class = "data.frame")
Here is the code I am working with:
# Count the rows in each var1/var2 combination and then try to add a
# percentage column computed from those counts.
# data: the data frame to summarise.
# var1, var2: passed straight to group_by_() as the grouping variables.
# Returns the grouped summary (when the mutate_() step succeeds).
for_plots = function(data, var1, var2){
grouped_data = data %>% group_by_(var1, var2) %>%
summarise_(n_in_group = ~n()) %>%
# NOTE(review): setNames() is called with the list only; no names argument
# is supplied, which matches the reported 'argument "nm" is missing' error.
mutate_(.dots = setNames(list(
interp(quote(n_in_group / sum(n_in_group, na.rm = TRUE) * 100),
# NOTE(review): n_in_group is not a variable defined in this function;
# as.name() presumably needs the string "n_in_group" - confirm.
n_in_group = as.name(n_in_group)))
))
return(grouped_data)
}
When I run the code, I receive an error:
Error in setNames(list(interp(quote(n_in_group/sum(n_in_group, na.rm = TRUE) * :
argument "nm" is missing, with no default
Any thoughts?
Here is some code based on @Frank's response:
# Count the rows in every var1 x var2 combination and express each count as a
# percentage of its var1 group.
#
# data: a data frame.
# var1, var2: names of the grouping columns, given as character strings.
# Returns the summary, still grouped by var1, with the two grouping columns
# plus n_in_group (row count) and percent (share of the var1 total, 0-100).
for_plots <- function(data, var1, var2) {
  data %>%
    # across(all_of()) selects columns by string name; it replaces the
    # deprecated standard-evaluation verb group_by_().
    group_by(across(all_of(c(var1, var2)))) %>%
    # "drop_last" keeps the var1 grouping, so the percentages computed next
    # sum to 100 within each var1 level.
    summarise(n_in_group = n(), .groups = "drop_last") %>%
    mutate(percent = (n_in_group / sum(n_in_group, na.rm = TRUE)) * 100)
}

Bar chart for factorial designs in R

I'm currently trying to create a clustered bar chart using ggplot2. It's basically just mean response times for a 2x2x2 factorial design. The three factors are load, compatibility and salience. I'm having a hard time fitting the third factor (salience) in there, though. It shouldn't be a stacked graph.
This is what I currently have
# Dodged bars of the mean response with normal-theory confidence-interval
# error bars, added to the existing ggplot object `bar`.
# The leading "+" that was on the continuation lines (presumably a pasted
# console prompt) is removed: following a trailing "+", it is parsed as a
# unary plus applied to the next layer.
# `fun` replaces the `fun.y` argument, which is deprecated in ggplot2 >= 3.3.0.
bar +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  stat_summary(
    fun.data = mean_cl_normal, geom = "errorbar",
    position = position_dodge(width = 0.90), width = 0.2
  ) +
  labs(x = "Compatibility", y = "Mean RT", fill = "Load")
Here's a small sample of the data I'm trying to graph:
ID load comp sal rt
1 1 High Incompatible Non_Salient 787
2 1 Low Compatible Salient 754
3 2 High Incompatible Salient 654
I've seen graphs like these numerous times before but I have no idea how to get ggplot2 to display three independent variables at the same time.
I've tried splitting the graphs by adding
+ facet_wrap( ~ sal)
but that doesn't work either. It just says "Invalid argument to unary operator"
Any help would be appreciated.
Is this the kind of plot you are looking for?
I used the Wii data from the book "Discovering Statistics Using R", which is in a similar format to yours.
structure(list(athlete = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Athlete", "Non-Athlete"), class = "factor"),
stretch = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No Stretching", "Stretching"
), class = "factor"), wii = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Playing Wii",
"Watching Wii"), class = "factor"), injury = c(2L, 2L, 1L,
2L, 0L, 1L, 2L, 0L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 0L, 0L, 3L,
3L, 3L, 2L, 1L, 0L, 2L, 2L, 3L, 2L, 2L, 3L, 1L, 2L, 4L, 1L,
2L, 2L, 2L, 1L, 4L, 4L, 1L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L,
2L, 2L, 1L, 0L, 3L, 3L, 2L, 1L, 2L, 4L, 1L, 2L, 5L, 5L, 3L,
6L, 4L, 3L, 4L, 5L, 5L, 2L, 6L, 4L, 4L, 4L, 3L, 4L, 3L, 2L,
1L, 4L, 3L, 2L, 2L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 7L, 8L, 6L,
9L, 4L, 7L, 5L, 9L, 6L, 4L, 8L, 5L, 4L, 7L, 10L, 1L, 3L,
2L, 1L, 3L, 3L, 2L, 3L, 4L, 2L, 0L, 1L, 3L, 2L, 0L)),
.Names = c("athlete", "stretch", "wii", "injury"),
class = "data.frame", row.names = c(NA, -120L))
Here is how to produce the plot.
library(ggplot2)
library(Hmisc)
# Three-factor bar chart: stretch on the x axis, wii as the dodged fill
# colour, and athlete as the facet (one panel per level).
# `Wii` is the data frame whose dput() output is shown above.
ggplot(data = Wii, aes(x = stretch, y = injury, fill = wii)) +
  facet_wrap(~athlete) +
  # Bar height is the mean injury score per cell; `fun` replaces the
  # `fun.y` argument, which is deprecated in ggplot2 >= 3.3.0.
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  # The dodge width of 0.90 lines the error bars up with the dodged bars.
  stat_summary(
    fun.data = mean_cl_normal, geom = "errorbar",
    position = position_dodge(width = 0.90), width = 0.2
  )

Resources