Related
I have this input:
structure(list(Topic = structure(c(1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1",
"2", "3"), class = "factor"), Sex = structure(c(1L, 1L, 1L, 2L,
2L, 2L), .Label = c("Female", "Male"), class = "factor"), Count = c(2L,
15L, 23L, 7L, 20L, 34L)), class = "data.frame", row.names = c(NA,
-6L))
and I try this code:
# Attempted bar chart: Sex is mapped to both y and fill; the Count column is never referenced.
ggplot(data=dat, aes(x=Topic, y=Sex, fill=Sex)) + geom_bar(stat="identity")
However, the problem is that it shows the same proportion for every topic.
How is it possible to make the plot like this one?
What about this?
# Grouped (dodged) bar chart: within each Topic, one bar per Sex whose height
# is the Count value.
ggplot(dat, aes(x = Topic, y = Count, fill = Sex)) +
  geom_col(position = "dodge")
I am trying to show the significance levels within a group consisting of two factors, but I seem to always get the significance levels between groups which is not what I want.
df <- structure(list(Datum = structure(c(2L, 1L, 3L, 1L, 1L, 3L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 3L, 1L, 2L), .Label = c("2021-04-08",
"2021-05-17", "2021-07-07"), class = "factor"), Soll = c("1202",
"172", "119", "1192", "119", "1189", "1189", "552", "1189", "1192",
"2484", "119", "1189", "1189", "172", "552", "1192", "172", "1189",
"172"), Plot = c("6", "5", "3", "4", "6", "5", "4", "5", "7",
"8", "3", "6", "6", "1", "8", "3", "1", "3", "8", "4"), Entfernung = structure(c(2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 2L), .Label = c("2", "5"), class = "factor"), Behandlung = structure(c(1L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L), .Label = c("a", "b"), class = "factor"), DGUnkraut = c(3.5,
0, 2.8, 3, 0.3, 2, 1, 3, 0, 0.3, 10, 0, 1.7, 2.5, 0.2, 0.3, 9,
0.3, 2.5, 0.2)), class = "data.frame", row.names = c(NA, -20L
))
This is what I have tried so far:
library(tidyverse)
library(ggsignif)
# Boxplots of DGUnkraut per Entfernung, coloured by Datum, with one
# significance bracket for the comparison Entfernung "2" vs "5".
# The former group_by(Datum, Entfernung) step is removed: ggplot2 layers do
# not use dplyr grouping, so it had no effect on the plot.
df %>%
  ggplot(aes(Entfernung, DGUnkraut, color = Datum)) +
  geom_boxplot() +
  geom_signif(
    comparisons = list(c("2", "5")),
    # TRUE spelled out: T is an ordinary variable that can be reassigned
    map_signif_level = TRUE
  )
So I would like to see the significant differences between "2" and "5" for each of the three dates, so for example that the significance level of the red boxplot with the date "2021-04-08" and Entfernung = "2" is compared to the one where Entfernung = "5".
Facets don't seem to work with {ggsignif}, but you could fake them, by looping over your dates, and then patching the plots together.
Below is one way:
library(ggsignif)
library(patchwork)
# Build one boxplot per Datum (a manual stand-in for facets), each with its
# own "2" vs "5" significance bracket, then combine them with patchwork.
# NOTE(review): map() is purrr's; make sure purrr/tidyverse is attached in
# addition to ggsignif and patchwork.
df %>%
  split(., .$Datum) %>%
  map(~ {
    ggplot(., aes(Entfernung, DGUnkraut, color = Datum)) +
      geom_boxplot() +
      geom_signif(
        comparisons = list(c("2", "5")),
        # TRUE spelled out: T is an ordinary variable that can be reassigned
        map_signif_level = TRUE
      ) +
      # keep unused Entfernung levels on the x axis of every sub-plot
      scale_x_discrete(drop = FALSE)
  }) %>%
  wrap_plots() +
  plot_layout(guides = "collect")
Please find my data p below. I had to include 100 samples to reproduce the error.
Question: why is geom_text not printing consistently center-aligned above the geom_col - e.g. 21 and 28 in All in the right SSA-facet? I tried adjusting position.dodge2 and vjust, but that did not work.
This thread addressed the issue but did not solve my problem.
My script
# Dodged bar chart of response counts (N) per study, one facet per treatment
# (Val), with N printed as a text label for each bar.
ggplot(p %>%
mutate(nystudie=as.character(study),
best.resp =as.factor(response)) %>%
group_by(nystudie,best.resp) %>%
summarise(N=n(),Val=unique(treatment)) %>%
# Append counts pooled over all studies, labelled as the pseudo-study "All".
bind_rows(p %>% filter(response %in% 1:4, treatment!="Control") %>% droplevels() %>%
mutate(nystudie=as.character(study),
best.resp =as.factor(response)) %>%
group_by(best.resp,treatment) %>% summarise(N=n()) %>%
mutate(nystudie="All") %>%
rename(Val=treatment)),
aes(nystudie, N, color = best.resp, fill= best.resp)) +
geom_col(position = position_dodge2(preserve = "single", padding = 0.1)) +
facet_wrap(~Val,ncol = 2, scales="free") +
scale_fill_grey(name="") +
scale_color_grey(name="") +
scale_y_continuous(breaks = seq(0,120,20)) +
# NOTE(review): this position_dodge2() call is given different arguments than
# the one used for geom_col() above.
geom_text(aes(label=N),position = position_dodge2(.5), vjust=0, fontface=2, cex=4.5, show.legend = F) +
theme(strip.background = element_blank(),
strip.text = element_text(color = "black", size = 15),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
plot.margin = unit(c(1,3,1,1), "lines"))
Data
p <- structure(list(study = structure(c(8L, 12L, 12L, 12L, 4L, 4L,
1L, 11L, 11L, 13L, 1L, 13L, 14L, 9L, 9L, 10L, 12L, 11L, 4L, 11L,
11L, 12L, 8L, 11L, 13L, 11L, 6L, 15L, 6L, 4L, 7L, 13L, 11L, 4L,
1L, 6L, 1L, 11L, 16L, 1L, 10L, 15L, 1L, 11L, 1L, 6L, 1L, 11L,
12L, 11L, 13L, 16L, 1L, 8L, 11L, 10L, 4L, 4L, 12L, 10L, 6L, 15L,
12L, 14L, 12L, 1L, 1L, 16L, 12L, 12L, 8L, 7L, 1L, 1L, 13L, 13L,
14L, 9L, 14L, 2L, 11L, 4L, 1L, 16L, 15L, 11L, 9L, 4L, 13L, 12L,
6L, 16L, 4L, 1L, 15L, 6L, 4L, 1L, 9L, 2L), .Label = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
"15", "22"), class = "factor"), response = c("1", "3", "4", "4",
"3", "3", "3", "4", "4", "4", "4", "4", "3", "4", "4", "4", "3",
"4", "4", "4", "4", "3", "1", "4", "4", "4", "3", "4", "3", "3",
"4", "4", "4", "3", "4", "4", "4", "4", "4", "3", "4", "4", "3",
"4", "4", "3", "3", "4", "3", "4", "4", "4", "4", "3", "3", "4",
"4", "3", "3", "4", "3", "4", "4", "4", "3", "3", "4", "4", "4",
"4", "2", "4", "4", "4", "4", "4", "3", "4", "3", "3", "4", "4",
"4", "4", "4", "4", "3", "3", "4", "4", "3", "4", "4", "4", "4",
"3", "3", "4", "2", "3"), treatment = structure(c(2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L), .Label = c("SSTR", "SSA"), class = "factor")), row.names = c(NA,
-100L), class = "data.frame")
When adding labels you have to take care to use the same positioning as for geom_col. To align the labels with the bars use position_dodge2(preserve = "single", width = .9, padding = 0.1):
library(ggplot2)
library(dplyr)
# Per-study counts of each response level. Val carries the treatment(s) seen
# in the group and is used as the facet variable below.
d1 <- p %>%
  mutate(
    nystudie = as.character(study),
    best.resp = as.factor(response)
  ) %>%
  group_by(nystudie, best.resp) %>%
  # .groups = "drop" makes the result ungrouped explicitly instead of relying
  # on summarise()'s implicit regrouping (and its message)
  summarise(N = n(), Val = unique(treatment), .groups = "drop")

# Counts pooled over all studies, shown as the pseudo-study "All".
d2 <- p %>%
  filter(response %in% 1:4, treatment != "Control") %>%
  droplevels() %>%
  mutate(
    nystudie = as.character(study),
    best.resp = as.factor(response)
  ) %>%
  group_by(best.resp, treatment) %>%
  summarise(N = n(), .groups = "drop") %>%
  mutate(nystudie = "All") %>%
  rename(Val = treatment)

d <- bind_rows(d1, d2)

ggplot(d, aes(nystudie, N, color = best.resp, fill = best.resp)) +
  geom_col(position = position_dodge2(preserve = "single", padding = 0.1)) +
  facet_wrap(~Val, ncol = 2, scales = "free") +
  scale_fill_grey(name = "") +
  scale_color_grey(name = "") +
  scale_y_continuous(breaks = seq(0, 120, 20)) +
  geom_text(
    aes(label = N),
    # the labels need a dodge that matches the bars' dodge to line up with them
    position = position_dodge2(preserve = "single", width = .9, padding = 0.1),
    vjust = 0, fontface = 2, cex = 4.5, show.legend = FALSE
  ) +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(color = "black", size = 15),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.margin = unit(c(1, 3, 1, 1), "lines")
  )
I have a dataset that contains a variable nr.employed. It's numeric.
I am normalizing it using this code:
markting_train_dim_deleted =
"","custAge","profession","marital","schooling","default","contact","month","campaign","previous","poutcome","cons.price.idx","cons.conf.idx","euribor3m","nr.employed","pmonths","pastEmail","responded"
"1",0.486842105263158,"1","3","7","2","1","8",0,0,"2",0.389321901792677,0.368200836820084,0.806393108138744,5195.8,999,0,"1"
"2",0.342105263157895,"2","2","1","1","1","4",0,0,"2",0.669134840218243,0.338912133891213,0.980729993198821,5228.1,999,0,"1"
"3",0.315789473684211,"10","2","4","1","2","7",0,0,"2",0.698752922837102,0.602510460251046,0.95737927907504,5191,999,0,"1"
"4",0.486842105263158,"5","1","1","2","1","4",0.0256410256410256,0,"2",0.669134840218243,0.338912133891213,0.981183405123555,5228.1,999,0,"1"
"5",0.215870043275927,"1","1","7","1","1","7",0.102564102564103,0.166666666666667,"1",0.26968043647701,0.192468619246862,0.148945817274994,5099.1,999,1,"1"
"6",0.381578947368421,"2","2","1","1","2","7",0,0,"2",0.698752922837102,0.602510460251046,0.95737927907504,5191,999,0,"1"
# Names of the columns to rescale.
# NOTE(review): each entry must match a column name of the data frame exactly
# (including any whitespace) for the `[, i]` lookups below to succeed.
cnames=c("custAge","campaign","previous","cons.price.idx","cons.conf.idx",
"euribor3m"," nr.employed","pmonths","pastEmail")
# Min-max rescale each listed column in place, printing the column name and
# its current values first.
for(i in cnames){
print(i)
print(markting_train_dim_deleted[,i])
# (x - min(x)) divided by the maximum of the shifted column, i.e. its range
markting_train_dim_deleted[,i]=
(markting_train_dim_deleted[,i]-min(markting_train_dim_deleted[,i]))/
(max(markting_train_dim_deleted[,i]-min(markting_train_dim_deleted[,i])))
}
After processing euribor3m, it prints nr.employed and then throws an exception:
Error in `[.data.frame`(markting_train_dim_deleted, , i) :
undefined columns selected
I have looked at the structure. It's a numeric data type with no missing values.
output
dput(head(markting_train_dim_deleted))
structure(list(custAge = c(0.486842105263158, 0.342105263157895,
0.315789473684211, 0.486842105263158, 0.215870043275927, 0.381578947368421
), profession = structure(c(1L, 2L, 10L, 5L, 1L, 2L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"), class = "factor"),
marital = structure(c(3L, 2L, 2L, 1L, 1L, 2L), .Label = c("1",
"2", "3", "4"), class = "factor"), schooling = structure(c(7L,
1L, 4L, 1L, 7L, 1L), .Label = c("1", "2", "3", "4", "5",
"6", "7", "8"), class = "factor"), default = structure(c(2L,
1L, 1L, 2L, 1L, 1L), .Label = c("1", "2", "3"), class = "factor"),
contact = structure(c(1L, 1L, 2L, 1L, 1L, 2L), .Label = c("1",
"2"), class = "factor"), month = structure(c(8L, 4L, 7L,
4L, 7L, 7L), .Label = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = "factor"), campaign = c(0, 0, 0,
0.0256410256410256, 0.102564102564103, 0), previous = c(0,
0, 0, 0, 0.166666666666667, 0), poutcome = structure(c(2L,
2L, 2L, 2L, 1L, 2L), .Label = c("1", "2", "3"), class = "factor"),
cons.price.idx = c(0.389321901792677, 0.669134840218243,
0.698752922837102, 0.669134840218243, 0.26968043647701, 0.698752922837102
), cons.conf.idx = c(0.368200836820084, 0.338912133891213,
0.602510460251046, 0.338912133891213, 0.192468619246862,
0.602510460251046), euribor3m = c(0.806393108138744, 0.980729993198821,
0.95737927907504, 0.981183405123555, 0.148945817274994, 0.95737927907504
), nr.employed = c(5195.8, 5228.1, 5191, 5228.1, 5099.1,
5191), pmonths = c(999, 999, 999, 999, 999, 999), pastEmail = c(0L,
0L, 0L, 0L, 1L, 0L), responded = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("1", "2"), class = "factor")), .Names = c("custAge",
"profession", "marital", "schooling", "default", "contact", "month",
"campaign", "previous", "poutcome", "cons.price.idx", "cons.conf.idx",
"euribor3m", "nr.employed", "pmonths", "pastEmail", "responded"
), row.names = c(NA, 6L), class = "data.frame")
The mistake is simply having " nr.employed" (with a space) rather than "nr.employed" in cnames.
Also, something like
# Min-max rescale every column named in cnames to [0, 1], in place.
# lapply() over df[cnames] is type-stable: it always yields one element per
# column, whereas sapply() may simplify its result and df[, cnames] collapses
# to a plain vector when cnames has length one.
markting_train_dim_deleted[cnames] <- lapply(
  markting_train_dim_deleted[cnames],
  function(x) (x - min(x)) / (max(x) - min(x))
)
would make the normalization easier to read.
I have two sets of data, all in one data frame. The first set is related to data collected in Location 1 and the second set is collected in Location 2. Each location has different count data (column value) for 5 months.
# DataSet
# -----------------
rp_data <- structure(list(Month = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), location = c("1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2"), value = c(0L, 1L, 1L, 1L,
2L, 1L, 0L, 0L, 1L, 1L, 3L, 2L, 1L, 4L, 1L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 0L, 2L, 4L, 3L, 5L, 5L, 0L, 4L, 3L, 3L, 4L, 2L, 5L,
2L, 3L, 10L, 6L, 5L, 6L, 4L, 6L, 4L, 5L, 6L, 5L, 3L, 7L, 1L,
1L, 1L, 1L, 0L, 0L, 2L, 1L, 2L, 0L, 2L, 3L, 4L, 1L, 2L, 1L, 2L,
0L, 2L, 2L, 4L, 4L, 5L, 1L, 4L, 5L, 4L, 5L, 1L, 4L, 3L, 7L, 7L,
4L, 2L, 5L, 4L, 1L, 5L, 3L, 7L, 3L, 4L, 8L, 5L, 7L, 1L, 1L, 6L,
3L)), .Names = c("Month", "location", "value"), row.names = c(NA,
-100L), class = "data.frame")
I used this example below, as illustrated on the ggridges examples webpage, to display the various count values across different months.
# Plot 1: histogram ridgelines of `value` per Month for location 1.
# Each non-empty bin is labelled with its count; empty bins get no label.
#---------------
ggplot(
  rp_data[rp_data$location == "1", ],
  aes(x = value, y = Month, group = Month)
) +
  geom_density_ridges2(
    aes(fill = Month),
    stat = "binline", binwidth = 1, scale = 0.95
  ) +
  geom_text(
    stat = "bin",
    # after_stat() replaces the deprecated ..count.. notation; `group` and
    # `count` are columns of the binned data. The label height is the group's
    # baseline plus the bar height scaled like the ridgelines (0.95).
    aes(
      y = after_stat(group + 0.95 * (count / max(count))),
      label = after_stat(ifelse(count > 0, count, ""))
    ),
    vjust = 1.4, size = 3, color = "white", binwidth = 1
  ) +
  scale_x_continuous(
    breaks = 0:12, limits = c(-.5, 13), expand = c(0, 0),
    name = "random value"
  ) +
  scale_y_discrete(
    expand = c(0.01, 0), name = "Month",
    labels = c("5.0", "4.0", "3.0", "2.0", "1.0")
  ) +
  scale_fill_cyclical(values = c("#0000B0", "#7070D0")) +
  labs(
    title = "Poisson random samples location 1 different Month",
    subtitle = "sample size n=10"
  ) +
  guides(y = "none") +
  theme_ridges(grid = FALSE) +
  theme(
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5)
  )
# Plot 2: histogram ridgelines of `value` per Month for location 2.
# Each non-empty bin is labelled with its count; empty bins get no label.
#---------------
ggplot(
  rp_data[rp_data$location == "2", ],
  aes(x = value, y = Month, group = Month)
) +
  geom_density_ridges2(
    aes(fill = Month),
    stat = "binline", binwidth = 1, scale = 0.95
  ) +
  geom_text(
    stat = "bin",
    # after_stat() replaces the deprecated ..count.. notation; `group` and
    # `count` are columns of the binned data. The label height is the group's
    # baseline plus the bar height scaled like the ridgelines (0.95).
    aes(
      y = after_stat(group + 0.95 * (count / max(count))),
      label = after_stat(ifelse(count > 0, count, ""))
    ),
    vjust = 1.4, size = 3, color = "white", binwidth = 1
  ) +
  scale_x_continuous(
    breaks = 0:12, limits = c(-.5, 13), expand = c(0, 0),
    name = "random value"
  ) +
  scale_y_discrete(
    expand = c(0.01, 0), name = "Month",
    labels = c("5.0", "4.0", "3.0", "2.0", "1.0")
  ) +
  scale_fill_cyclical(values = c("#0000B0", "#7070D0")) +
  labs(
    title = "Poisson random samples location 2 different Month",
    subtitle = "sample size n=10"
  ) +
  guides(y = "none") +
  theme_ridges(grid = FALSE) +
  theme(
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5)
  )
Result for plot 1:
My question is how can I combine these two plots, sort of like an overlay plot as shown in this example:
I don't want to plot them in two separate plots.
You need to create a grouping variable that contains both Month and location. You can do that by using paste0(Month, location). For now, I'm leaving out the text labels, though they may be possible with a little more thought as well. (But I think they'd make the figure too busy.)
# Overlay of both locations: one semi-transparent histogram ridgeline per
# Month/location pair, drawn on the shared Month baseline.
ggplot(
  rp_data,
  aes(
    x = value, y = Month,
    # one group (and one fill key) per Month/location combination
    group = paste0(Month, location),
    fill = paste0(Month, location)
  )
) +
  geom_density_ridges2(
    stat = "binline", binwidth = 1,
    scale = 0.95, alpha = 0.7
  ) +
  scale_x_continuous(
    breaks = 0:12, limits = c(-.5, 13),
    expand = c(0, 0), name = "random value"
  ) +
  scale_y_discrete(
    expand = c(0.01, 0), name = "Month",
    labels = c("5.0", "4.0", "3.0", "2.0", "1.0")
  ) +
  # four colours cycled over the Month/location fill keys
  scale_fill_cyclical(values = c(
    "#0000B0", "#B00000",
    "#7070D0", "#FC5E5E"
  )) +
  labs(
    # title corrected: this figure shows both locations, not only location 1
    title = "Poisson random samples, locations 1 and 2, different Month",
    subtitle = "sample size n=10"
  ) +
  guides(y = "none") +
  theme_ridges(grid = FALSE, center = TRUE)
Edit: Now with text labels.
# Overlay of both locations with per-bin count labels: one semi-transparent
# histogram ridgeline per Month/location pair, labels coloured by location.
ggplot(
  rp_data,
  aes(
    x = value, y = Month,
    group = paste0(Month, location),
    fill = paste0(Month, location)
  )
) +
  geom_density_ridges2(
    stat = "binline", binwidth = 1, scale = 0.95, alpha = 0.7
  ) +
  geom_text(
    stat = "bin",
    # after_stat() replaces the deprecated ..count.. notation. There are two
    # groups (locations) per Month, so ceiling(group / 2) maps the group
    # index back to the Month baseline.
    aes(
      y = after_stat(ceiling(group / 2) + 0.95 * (count / max(count))),
      label = after_stat(ifelse(count > 0, count, "")),
      color = location
    ),
    vjust = 1.4, size = 3, binwidth = 1, fontface = "bold"
  ) +
  scale_x_continuous(
    breaks = 0:12, limits = c(-.5, 13), expand = c(0, 0),
    name = "random value"
  ) +
  scale_y_discrete(
    expand = c(0.01, 0), name = "Month",
    labels = c("5.0", "4.0", "3.0", "2.0", "1.0")
  ) +
  scale_fill_cyclical(
    values = c("#0000B0", "#B00000", "#7070D0", "#FC5E5E")
  ) +
  scale_color_cyclical(values = c("white", "black")) +
  labs(
    # title corrected: this figure shows both locations, not only location 1
    title = "Poisson random samples, locations 1 and 2, different Month",
    subtitle = "sample size n=10"
  ) +
  guides(y = "none") +
  theme_ridges(grid = FALSE, center = TRUE)
Again, not sure it's a good idea, but there you go.