Facet Wrap Issue with ggplot Rain Cloud Plot - r

I'm very new to R and am trying to facet_wrap raincloud plots. I am trying to facet_wrap by Hypothesis chosen (which has been binary coded), so ideally would like to plot proportion of confirmatory and disconfirmatory leads chosen by hypothesis.
Here is what I have so far:
# Reshape my_data to long format: one row per ID and measured variable,
# with the variable name in `Leads` and its value in `Proportion`.
# NOTE(review): "Hypothesis" is listed in measure.vars, so it is stacked into
# Leads/Proportion instead of remaining a column of its own.
my_data2 <- melt(my_data, id.vars = c("ID"),
measure.vars = c("Proportion.of.Disconfirmatory.Leads.Chosen","Proportion.of.Confirmatory.Leads.Chosen", "Hypothesis"),
# NOTE(review): "Hyp" is an extra unnamed argument; confirm it is intended.
variable.name = "Leads", "Hyp",
value.name = "Proportion")
# Raincloud plot: flat violin, jittered points and a narrow boxplot per Leads
# level, drawn horizontally via coord_flip().
plot3 <- ggplot(data = my_data2, aes(y = Proportion, x = Leads, fill = Leads)) +
geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
geom_point(aes(y = Proportion, color = Leads), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
geom_boxplot(width = .1, guides = FALSE, outlier.shape = NA, alpha = 0.5) +
# Facets on Hypothesis, which my_data2 does not have as a column after the
# melt above -- this is the variable the reported error complains about.
facet_wrap(vars(Hypothesis), nrow = 2)+
expand_limits(x = 5.25) +
# Hide the fill and colour legends.
guides(fill = FALSE) +
guides(color = FALSE) +
scale_color_brewer(palette = "Spectral") +
scale_fill_brewer(palette = "Spectral") +
coord_flip() +
theme_bw()
plot3
However, I am receiving this error:
"Error: At least one layer must contain all faceting variables: `Hypothesis`.
* Plot is missing `Hypothesis`
* Layer 1 is missing `Hypothesis`
* Layer 2 is missing `Hypothesis`
* Layer 3 is missing `Hypothesis`
* Layer 4 is missing `Hypothesis`"
> dput(my_data)
structure(list(ID = c(2L, 5L, 23L, 34L, 35L, 48L, 53L, 59L, 71L,
76L, 1L, 3L, 4L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 24L, 25L, 26L, 27L, 28L, 29L, 30L,
31L, 32L, 33L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
46L, 47L, 49L, 50L, 51L, 52L, 54L, 55L, 56L, 57L, 58L, 60L, 61L,
62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 72L, 73L, 74L, 75L,
78L), Hypothesis = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), Sum.of.Disconfirmatory.Leads.Chosen = c(9L, 7L, 0L,
3L, 4L, 1L, 2L, 3L, 6L, 3L, 2L, 3L, 5L, 3L, 4L, 3L, 3L, 5L, 0L,
5L, 5L, 1L, 4L, 5L, 6L, 4L, 5L, 2L, 6L, 4L, 6L, 1L, 4L, 4L, 8L,
3L, 4L, 2L, 5L, 2L, 4L, 7L, 1L, 1L, 2L, 3L, 5L, 2L, 5L, 8L, 0L,
5L, 4L, 7L, 3L, 4L, 6L, 1L, 1L, 4L, 4L, 8L, 7L, 3L, 4L, 6L, 2L,
5L, 2L, 5L, 5L, 8L, 2L, 4L, 5L, 7L), Sum.of.Confirmatory.Leads.Chosen = c(5L,
2L, 2L, 2L, 8L, 3L, 4L, 5L, 4L, 2L, 4L, 6L, 3L, 7L, 4L, 3L, 2L,
3L, 3L, 7L, 4L, 5L, 2L, 3L, 6L, 4L, 9L, 6L, 5L, 5L, 1L, 1L, 3L,
6L, 6L, 3L, 7L, 1L, 2L, 3L, 6L, 8L, 2L, 2L, 6L, 9L, 5L, 6L, 5L,
4L, 6L, 6L, 2L, 3L, 2L, 5L, 6L, 4L, 5L, 4L, 5L, 4L, 5L, 7L, 4L,
5L, 4L, 4L, 3L, 5L, 5L, 7L, 6L, 4L, 3L, 7L), Proportion.of.Disconfirmatory.Leads.Chosen = c(64.28571429,
77.77777778, 0, 60, 33.33333333, 25, 33.33333333, 37.5, 60, 60,
33.33333333, 33.33333333, 62.5, 30, 50, 50, 60, 62.5, 0, 41.66666667,
55.55555556, 16.66666667, 66.66666667, 62.5, 50, 50, 35.71428571,
25, 54.54545455, 44.44444444, 85.71428571, 50, 57.14285714, 40,
57.14285714, 50, 36.36363636, 66.66666667, 71.42857143, 40, 40,
46.66666667, 33.33333333, 33.33333333, 25, 25, 50, 25, 50, 66.66666667,
0, 45.45454545, 66.66666667, 70, 60, 44.44444444, 50, 20, 16.66666667,
50, 44.44444444, 66.66666667, 58.33333333, 30, 50, 54.54545455,
33.33333333, 55.55555556, 40, 50, 50, 53.33333333, 25, 50, 62.5,
50), Proportion.of.Confirmatory.Leads.Chosen = c(35.71428571,
22.22222222, 100, 40, 66.66666667, 75, 66.66666667, 62.5, 40,
40, 66.66666667, 66.66666667, 37.5, 70, 50, 50, 40, 37.5, 100,
58.33333333, 44.44444444, 83.33333333, 33.33333333, 37.5, 50,
50, 64.28571429, 75, 45.45454545, 55.55555556, 14.28571429, 50,
42.85714286, 60, 42.85714286, 50, 63.63636364, 33.33333333, 28.57142857,
60, 60, 53.33333333, 66.66666667, 66.66666667, 75, 75, 50, 75,
50, 33.33333333, 100, 54.54545455, 33.33333333, 30, 40, 55.55555556,
50, 80, 83.33333333, 50, 55.55555556, 33.33333333, 41.66666667,
70, 50, 45.45454545, 66.66666667, 44.44444444, 60, 50, 50, 46.66666667,
75, 50, 37.5, 50)), class = "data.frame", row.names = c(NA, -76L
))
> head(my_data)
ID Hypothesis Sum.of.Disconfirmatory.Leads.Chosen Sum.of.Confirmatory.Leads.Chosen
1 2 0 9 5
2 5 0 7 2
3 23 0 0 2
4 34 0 3 2
5 35 0 4 8
6 48 0 1 3
Proportion.of.Disconfirmatory.Leads.Chosen Proportion.of.Confirmatory.Leads.Chosen
1 64.28571 35.71429
2 77.77778 22.22222
3 0.00000 100.00000
4 60.00000 40.00000
5 33.33333 66.66667
6 25.00000 75.00000
I suspect that I have introduced the variable Hypothesis incorrectly in the code, however I have no idea where or how! I have tried to include it in sumld however am receiving this error when doing so:
Error in fs[[1]](x, ...) : attempt to apply non-function
Thank you all in advance for your help.

Hypothesis is not an independent column in your melted data frame my_data2, so it cannot be used for faceting. Because you included Hypothesis in measure.vars, it was converted into one of the categories of Leads in my_data2.
# Number of rows per Leads category in the melted data
my_data2 %>%
  group_by(Leads) %>%
  summarise(n = n())
# A tibble: 3 x 2
Leads n
<fct> <int>
1 Proportion.of.Disconfirmatory.Leads.Chosen 76
2 Proportion.of.Confirmatory.Leads.Chosen 76
3 Hypothesis 76
If you want to use it for faceting, include Hypothesis in id.vars instead.
# Melt to long format, keeping ID and Hypothesis as identifier columns so
# that Hypothesis remains available as a column for faceting.
my_data2 <- melt(
  my_data,
  id.vars = c("ID", "Hypothesis"),
  measure.vars = c(
    "Proportion.of.Disconfirmatory.Leads.Chosen",
    "Proportion.of.Confirmatory.Leads.Chosen"
  ),
  variable.name = "Leads",
  value.name = "Proportion"
)
head(my_data2)
ID Hypothesis Leads Proportion
1 2 0 Proportion.of.Disconfirmatory.Leads.Chosen 64.28571
2 5 0 Proportion.of.Disconfirmatory.Leads.Chosen 77.77778
3 23 0 Proportion.of.Disconfirmatory.Leads.Chosen 0.00000
4 34 0 Proportion.of.Disconfirmatory.Leads.Chosen 60.00000
5 35 0 Proportion.of.Disconfirmatory.Leads.Chosen 33.33333
6 48 0 Proportion.of.Disconfirmatory.Leads.Chosen 25.00000
Now you can use Hypothesis for faceting:
library(ggplot2)
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
# Raincloud plot of Proportion per Leads category, one panel per Hypothesis.
# Layers: flat violin (nudged sideways), jittered raw points, narrow boxplot.
plot3 <- ggplot(data = my_data2, aes(y = Proportion, x = Leads, fill = Leads)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
  geom_point(aes(y = Proportion, color = Leads),
             position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  # `guides` is not a geom_boxplot() argument (it is ignored with a warning);
  # show.legend is the per-layer switch for keeping a layer out of the legend.
  geom_boxplot(width = .1, show.legend = FALSE, outlier.shape = NA, alpha = 0.5) +
  facet_wrap(~Hypothesis, nrow = 2) +
  expand_limits(x = 5.25) +
  # "none" replaces the deprecated guides(<scale> = FALSE) form.
  guides(fill = "none", color = "none") +
  scale_color_brewer(palette = "Spectral") +
  scale_fill_brewer(palette = "Spectral") +
  coord_flip() +
  theme_bw()
plot3
Edited: Solution to the follow-up question on how to modify the variable names shown on the axis labels. One approach is to convert the variable to a factor and assign a label to each level.
# Replace the dotted column names with readable axis labels.
# `labels` is matched to `levels` by position, so the two vectors must list
# Disconfirmatory and Confirmatory in the same order.
my_data2$Leads <- factor(
  my_data2$Leads,
  levels = c("Proportion.of.Disconfirmatory.Leads.Chosen",
             "Proportion.of.Confirmatory.Leads.Chosen"),
  labels = c("Proportion of Disconfirmatory Leads Chosen",
             "Proportion of Confirmatory Leads Chosen")
)
Rerun ggplot code to produce this:

Related

Batch processing and export of a list of CSV files in R

I have 300 CSV files with same structure in a folder based on separate village names. I need to read each file individually, process those, and export output files in another folder with respective village names (e.g., 'village name'_score).
Here are the data for an example village file...
structure(list(ID_GC = structure(1:51, .Label = c("492K", "494K",
"497K", "498K", "499K", "500K", "501K", "502K", "503K", "504K",
"506K", "507K", "508K", "509K", "510K", "511K", "512K", "513K",
"514K", "516K", "517K", "518K", "519K", "522K", "523K", "524K",
"526K", "527K", "528K", "530K", "531K", "532K", "533K", "534K",
"535K", "536K", "537K", "538K", "539K", "540K", "541K", "542K",
"543K", "544K", "545K", "546K", "547K", "548K", "550K", "551K",
"552K"), class = "factor"), Lat = c(23.78107, 23.78115, 23.78122,
23.78123, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068, 23.78071,
23.78075, 23.78043, 23.78021, 23.77937, 23.77985, 23.77981, 23.77995,
23.77987, 23.7799, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906,
23.77899, 23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855,
23.77852, 23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797,
23.77788, 23.77786, 23.77809, 23.77815, 23.77771, 23.77757, 23.77772,
23.77752, 23.7774, 23.7772, 23.77869, 23.78084, 23.78178), Long = c(90.65016,
90.64968, 90.6497, 90.64969, 90.64972, 90.64996, 90.64987, 90.64989,
90.64924, 90.64921, 90.65, 90.64998, 90.6494, 90.64989, 90.64978,
90.64973, 90.64952, 90.64958, 90.64925, 90.64935, 90.6492, 90.64922,
90.64919, 90.64928, 90.64937, 90.64887, 90.64919, 90.64891, 90.64914,
90.64903, 90.64907, 90.6491, 90.64868, 90.6491, 90.64853, 90.64862,
90.64851, 90.64852, 90.64865, 90.64865, 90.64878, 90.64878, 90.64866,
90.64859, 90.64844, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994,
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"),
Depth_m = c(18, 18, 18, 210, 18, 31.5, 13.5, 15, 13.5, 21,
13.5, 18, 15, 240, 24, 13.5, 19.5, 33, 156, 14.4, 18, 21,
13.5, 18, 18, 51, 48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66,
54, 31.5, 21, 210, 66, 12, 54, 27, 219, 18, 18, 18, 18, 18,
21), As_ug_L = c(68L, 68L, 68L, 2L, 68L, 306L, 129L, 129L,
20L, 68L, 188L, 129L, 68L, 2L, 68L, 68L, 129L, 188L, 2L,
2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L, 20L, 2L, 2L, 2L,
2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L, 2L, 306L, 2L, 306L,
20L, 306L, 68L, 68L, 306L, 68L, 20L)), class = "data.frame", row.names = c(NA,
-51L))
And another dataset ("dtw_BG") that will be needed for the calculation of all villages...
structure(list(ID_GC = structure(c(10L, 11L, 12L, 13L, 14L, 8L,
9L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 1L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("1002F", "1008F", "1016F", "1029F", "1051F",
"1053F", "1058F", "1548D", "1561D", "498K", "509K", "514K", "540K",
"545K", "559K", "560K", "569K", "571K", "597K", "601K", "614K",
"819F", "829F", "933F", "934F", "951F", "957F", "958F", "959F",
"960F", "964F", "973F", "982F", "998F"), class = "factor"), Lat = c(23.78123,
23.77937, 23.7799, 23.77786, 23.77772, 23.77439336, 23.77204886,
23.77484, 23.775, 23.77528, 23.77492, 23.77521, 23.77593, 23.7757,
23.78494, 23.78473, 23.78385611, 23.78395451, 23.78426992, 23.78374538,
23.78377154, 23.78360725, 23.78340944, 23.78362259, 23.78272036,
23.78307399, 23.78269739, 23.78252464, 23.78279102, 23.78131262,
23.78149057, 23.77867098, 23.77828323, 23.78592929), Long = c(90.64969,
90.64989, 90.64925, 90.64865, 90.64844, 90.65543457, 90.65292302,
90.65158, 90.65192, 90.65219, 90.65232, 90.65363, 90.65356, 90.65483,
90.65025, 90.65238, 90.64900976, 90.64933908, 90.65082989, 90.64891814,
90.64902199, 90.64910447, 90.64933699, 90.6488857, 90.64921562,
90.64848103, 90.64799873, 90.64826494, 90.64738669, 90.64781684,
90.64612672, 90.64499055, 90.64476985, 90.6499865), Village = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L), .Label = c("Abdullapur", "Chauthar Kanda", "Nagra Para Faitadi",
"Nowa Para"), class = "factor"), Depth_m = c(210, 240, 156, 210,
219, 225, 195, 299.7, 299.7, 240, 240, 234, 240, 105, 165, 180,
180, 225, 180, 210, 195, 201, 180, 195, 210, 210, 195, 180, 225,
180, 108, 210, 225, 240), As_ug_L = c(2L, 2L, 2L, 2L, 20L, 2L,
2L, 2L, 20L, 2L, 2L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-34L))
I need to process all the villages but I am not certain how to loop them in. So far I was able to read all the individual village files using "readr" package.
library(readr)
# Full paths of all CSV files in the input folder.
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches only
# names that end in ".csv".
a <- list.files(path = "/Users/......",
                pattern = "\\.csv$", full.names = TRUE)
Here is the code I'm using for an individual village:
# Single-village scoring script: for every well in the village with
# Depth_m <= 90, count nearby wells of several kinds and write one CSV.
# NOTE(review): distGeo() is presumably geosphere::distGeo, which would make
# the "<= 100" thresholds below metres -- confirm.

# dtw_BG: reference wells shared by all villages; gw: wells of this village.
dtw_BG<- read.csv('/Users/...../dtw_BG.csv',header=TRUE)
gw<-read.csv('/Users/....../Abdullapur.csv',header=TRUE)
# Row subsets of gw by depth (Depth_m) and arsenic (As_ug_L) thresholds.
stw = gw[gw$Depth_m <= 90,]
stw_R = gw[gw$Depth_m <= 90 & gw$As_ug_L > 50,]
itw = gw[gw$Depth_m >= 45 & gw$Depth_m <= 90,]
itw_10 = gw[gw$Depth_m >= 45 & gw$Depth_m <= 90 & gw$As_ug_L <= 10,]
# Keep columns 3 and 2 only; in the sample data these are Long and Lat,
# i.e. coordinates in (longitude, latitude) order.
p<-stw [,c(3,2)]
R<-stw_R[,c(3,2)]
ITW<-itw[,c(3,2)]
ITW_10<- itw_10[,c(3,2)]
BG<-dtw_BG[,c(3,2)]
# maxscore: for each row of p, the number of R rows whose distance to that
# row is <= 100.
dist_R<- lapply(1:length(p[[1]]), function (i) distGeo (R, p[i,]))
dist_R<-lapply(1:length(p[[1]]), function (i) data.frame(R, dist_R[[i]]))
dist_R100<-lapply(1:length(p[[1]]),function (i) dist_R[[i]][dist_R[[i]][,3] <= 100,])
maxscore<- lapply(1:length(p[[1]]), function(i) nrow (dist_R100[[i]]))
maxscore<-unlist(maxscore)
# count_itw: same count, but against the ITW rows.
dist_ITW<- lapply(1:length(p[[1]]), function (i) distGeo (ITW, p[i,]))
dist_ITW<-lapply(1:length(p[[1]]), function (i) data.frame(ITW, dist_ITW[[i]]))
dist_ITW100<-lapply(1:length(p[[1]]),function (i) dist_ITW[[i]][dist_ITW[[i]][,3] <= 100,])
count_itw<- lapply(1:length(p[[1]]), function(i) nrow (dist_ITW100[[i]]))
count_itw<-unlist(count_itw)
# count_itw10: same count against ITW_10; all zeros when ITW_10 has no rows.
if (nrow(ITW_10)==0) {
count_itw10<- rep(0, length(maxscore))
} else {
dist_ITW10<- lapply(1:length(p[[1]]), function (i) distGeo (ITW_10, p[i,]))
dist_ITW10<-lapply(1:length(p[[1]]), function (i) data.frame(ITW_10, dist_ITW10[[i]]))
dist_ITW10_100<-lapply(1:length(p[[1]]),function (i) dist_ITW10[[i]][dist_ITW10[[i]][,3] <= 100,])
count_itw10<- lapply(1:length(p[[1]]), function(i) nrow (dist_ITW10_100[[i]]))
count_itw10<-unlist(count_itw10)
}
# Distances from every BG row to each row of p, stored next to the BG
# coordinates.
dist_BG<- lapply(1:length(p[[1]]), function (i) distGeo (BG, p[i,]))
dist_BG<-lapply(1:length(p[[1]]), function (i) data.frame(BG, dist_BG[[i]]))
# For each p row i and each j in 1..length(maxscore): the smallest distGeo()
# value between row j of dist_R100[[i]] and dist_BG[[i]].
# NOTE(review): j runs up to length(maxscore), which can exceed
# nrow(dist_R100[[i]]); confirm how out-of-range rows are meant to behave.
dtw<-lapply(1:length(p[[1]]), function(i) {
lapply(1: length(maxscore), function(j) {
min(distGeo( c(dist_R100[[i]][j,1], dist_R100[[i]][j,2]), dist_BG[[i]]))
}
)
}
)
# Flatten, then regroup into chunks of length(p[[1]]) values. The index
# vector 0:length(dtw) has one more element than dtw; the last group of the
# split result is dropped on the following line.
dtw<-unlist(dtw)
dtw<-split(dtw, (0:length(dtw) %/% length(p[[1]])))
dtw <- dtw[-length (dtw)]
# count: per group, how many of those minimum distances are <= 100.
count<-lapply(1:length(dtw), function(i) length(subset(dtw[[i]], dtw[[i]]<=100)))
count<-unlist(count)
score<-maxscore-count
# Attach the computed columns to the stw rows and write the result.
abc<-cbind (stw, maxscore, count, score, count_itw, count_itw10)
abc<- data.frame (abc)
write.csv (abc, "/Users/..../Output/Abdullapur_score.csv", row.names = F)
The output for the provided village should look like
structure(list(ID_GC = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L, 48L, 49L, 50L, 51L
), .Label = c("492K", "494K", "497K", "498K", "499K", "500K",
"501K", "502K", "503K", "504K", "506K", "507K", "508K", "509K",
"510K", "511K", "512K", "513K", "514K", "516K", "517K", "518K",
"519K", "522K", "523K", "524K", "526K", "527K", "528K", "530K",
"531K", "532K", "533K", "534K", "535K", "536K", "537K", "538K",
"539K", "540K", "541K", "542K", "543K", "544K", "545K", "546K",
"547K", "548K", "550K", "551K", "552K"), class = "factor"), Lat = c(23.78107,
23.78115, 23.78122, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068,
23.78071, 23.78075, 23.78043, 23.78021, 23.77985, 23.77981, 23.77995,
23.77987, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906, 23.77899,
23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855, 23.77852,
23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797, 23.77788,
23.77809, 23.77815, 23.77771, 23.77757, 23.77752, 23.7774, 23.7772,
23.77869, 23.78084, 23.78178), Long = c(90.65016, 90.64968, 90.6497,
90.64972, 90.64996, 90.64987, 90.64989, 90.64924, 90.64921, 90.65,
90.64998, 90.6494, 90.64978, 90.64973, 90.64952, 90.64958, 90.64935,
90.6492, 90.64922, 90.64919, 90.64928, 90.64937, 90.64887, 90.64919,
90.64891, 90.64914, 90.64903, 90.64907, 90.6491, 90.64868, 90.6491,
90.64853, 90.64862, 90.64851, 90.64852, 90.64865, 90.64878, 90.64878,
90.64866, 90.64859, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994,
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"),
Depth_m = c(18, 18, 18, 18, 31.5, 13.5, 15, 13.5, 21, 13.5,
18, 15, 24, 13.5, 19.5, 33, 14.4, 18, 21, 13.5, 18, 18, 51,
48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66, 54, 31.5, 21, 66,
12, 54, 27, 18, 18, 18, 18, 18, 21), As_ug_L = c(68L, 68L,
68L, 68L, 306L, 129L, 129L, 20L, 68L, 188L, 129L, 68L, 68L,
68L, 129L, 188L, 2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L,
20L, 2L, 2L, 2L, 2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L,
306L, 2L, 306L, 306L, 68L, 68L, 306L, 68L, 20L), maxscore = c(10L,
11L, 11L, 11L, 12L, 12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L,
10L, 9L, 10L, 9L, 10L, 10L, 10L, 7L, 7L, 5L, 7L, 6L, 9L,
9L, 9L, 8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 9L, 9L, 8L, 8L, 8L,
8L, 6L, 7L, 12L, 3L), count = c(10L, 11L, 11L, 11L, 12L,
12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L, 10L, 9L, 10L, 9L,
9L, 9L, 9L, 6L, 6L, 4L, 6L, 5L, 8L, 8L, 8L, 7L, 8L, 8L, 8L,
8L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 6L, 12L, 3L),
score = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L), count_itw = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 6L, 7L, 7L, 8L, 8L,
9L, 10L, 10L, 12L, 12L, 12L, 12L, 13L, 11L, 13L, 10L, 10L,
10L, 10L, 12L, 12L, 6L, 6L, 5L, 5L, 2L, 12L, 0L, 0L), count_itw10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 3L, 4L, 4L, 5L, 5L, 6L, 7L, 7L, 9L, 9L, 9L, 9L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 6L, 6L, 5L, 5L, 2L,
9L, 0L, 0L)), class = "data.frame", row.names = c(1L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L,
48L, 49L, 50L, 51L))
How can I export each village file with respect to its name?
thank you in advance :)
Simply generalize your process in a defined method that receives the village file name as parameter. Then build a list of data frames by iterating through file names and calling your method:
# Inputs shared by every village run.
# Reference wells: read once, then keep columns 3 and 2 (in that order).
dtw_BG <- read.csv("/Users/...../dtw_BG.csv", header = TRUE)
BG <- dtw_BG[, c(3, 2)]
# Folder that receives the per-village output files.
output_path <- "/Users/..../Output/"
# Process one village file, write "<Village>_score.csv" and return the result.
#
# village_file: path to a single village CSV.
# Returns the data frame `abc` built by the per-village code.
# Uses the global `output_path` as the destination folder.
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header=TRUE)
  #... REST OF CODE
  # The destination folder variable is `output_path` (lower-case p); the
  # "_score" suffix follows the naming of the single-village script.
  write.csv(abc, paste0(output_path, stw$Village[[1]], "_score.csv"), row.names = FALSE)
  return(abc)
}
# PASS FILE NAMES ITERATIVELY TO BUILD LIST OF DFs (WITH EACH CSV)
# `pattern` is a regular expression, not a glob: "\\.csv$" selects only file
# names ending in ".csv".
v_files <- list.files(path = "/path/to/inputs", pattern = "\\.csv$",
                      full.names = TRUE)
df_list <- lapply(v_files, calc_score)
By the way, possibly much of your code can be tightened up as distGeo can receive a matrix of Lon and Lat coordinates. Also, consider a cross join merge (i.e., all pairwise matches) between each pairing of data frames and p to cut down on the repetitious lapply calls. For code maintainability, try to use column names instead of numbers.
NOTE: Below needs testing on full data and is shown as example.
# Score one village file using cross joins instead of per-row lapply calls,
# write "<Village>_score.csv" into `output_path`, and return the data frame.
#
# village_file: path to a village CSV; the code reads the columns Long, Lat,
#   Depth_m, As_ug_L and Village.
# Returns the rows with Depth_m <= 90 plus the columns maxscore, count, score,
#   count_itw and count_itw10.
# Uses the globals `BG` and `output_path`.
# NOTE(review): distGeo() is presumably geosphere::distGeo -- confirm the
#   package is attached; the "<= 100" thresholds would then be metres.
# NOTE(review): each aggregate() below returns one row per p location that has
#   at least one match within 100, in aggregate()'s own order. Confirm the
#   resulting vectors line up with the rows of stw before relying on the
#   final cbind.
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header=TRUE)

  ### DATA FRAME SUBSETS
  stw <- gw[gw$Depth_m <= 90,]
  p <- stw[, c("Long", "Lat")]
  R <- gw[gw$Depth_m <= 90 & gw$As_ug_L > 50, c("Long", "Lat")]
  ITW <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90, c("Long", "Lat")]
  ITW_10 <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90 & gw$As_ug_L <= 10, c("Long", "Lat")]

  ### MAX SCORE CALCULATION
  cj <- merge(R, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
  dist_R <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
  dist_R100 <- subset(dist_R, Distance <= 100)
  maxscore <- aggregate(cbind(Score=Distance) ~ Long_ + Lat_, dist_R100, FUN=length)$Score

  ### COUNT ITW100 CALCULATION
  cj <- merge(ITW, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
  dist_ITW <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
  dist_ITW100 <- subset(dist_ITW, Distance <= 100)
  count_itw <- aggregate(cbind(Count=Distance) ~ Long_ + Lat_, dist_ITW100, FUN=length)$Count

  ### COUNT ITW10 CALCULATION
  if (nrow(ITW_10)==0) {
    count_itw10 <- rep(0, length(maxscore))
  } else {
    # The subset defined above is ITW_10; `IT_10` does not exist.
    cj <- merge(ITW_10, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
    dist_ITW10 <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))
    dist_ITW10_100 <- subset(dist_ITW10, Distance <= 100)
    count_itw10 <- aggregate(cbind(Count=Distance) ~ Long_ + Lat_, dist_ITW10_100, FUN=length)$Count
  }

  ### MINIMUM DISTANCE
  cj <- merge(BG, p, by=NULL, suffixes=c("", "_")) # CROSS JOIN OF ALL ROWS BETWEEN DFs
  dist_BG <- transform(cj, Distance = distGeo(cj[c("Long", "Lat")], cj[c("Long_", "Lat_")]))

  mdf <- merge(dist_R100, dist_BG, by=c("Long_", "Lat_"),
               suffixes=c("", "_")) # MERGE AT p LEVEL
  dtw <- transform(mdf, Distance = distGeo(mdf[c("Long", "Lat")], mdf[c("Long_", "Lat_")]))
  dtw <- aggregate(Distance ~ Long + Lat, dtw, FUN=min)$Distance

  ### SCORE CALCULATION
  dtw <- unlist(dtw)
  dtw <- split(dtw, (0:length(dtw) %/% length(p[[1]])))
  dtw <- dtw[-length (dtw)]
  count <- sapply(dtw, function(d) length(d[d<=100]))
  score <- maxscore - count

  ### FINAL DATA FRAME
  village_df <- cbind.data.frame(stw, maxscore, count, score, count_itw, count_itw10)
  # The destination folder variable is `output_path` (lower-case p); the
  # "_score" suffix follows the naming of the single-village script.
  write.csv(village_df, paste0(output_path, village_df$Village[[1]], "_score.csv"), row.names = FALSE)
  return(village_df)
}

How to create a faceted boxplot with the significant differences, and 2 measured variables?

I managed to create a faceted boxplot with my 2 quantitative variables;
I know how to run a kruskal-wallis followed by a Wilcoxon test and show the significant differences with letters in the boxplot but only in a simple boxplot, with one variable and without facet. How can I do ?
(If possible, I would like to mark the significant differences with letters. I wish I could post pictures of what I have already done, but apparently I'm not allowed to.)
Also, I have another question; Which test does the function stat_function_mean execute ? I tried to use this function, but I don't know how to use it... Here is my code without the test, only the facetted boxplot with my two variables :
Code for my faceted boxplot with 2 measured variables (FF and MF)
# Long format: one row per plant and measured variable (FF or MF).
dat.m2 <- melt(
  pheno,
  id.vars = c("fusion", "Genotype", "Hormone"),
  measure.vars = c("FF", "MF")
)

# Fix the treatment order on the x axis: control first, then CK and GA doses.
dat.m2$fusion <- factor(
  dat.m2$fusion,
  levels = c(
    "Control",
    "CK 20 mg/L", "CK 100 mg/L", "CK 500 mg/L",
    "GA 20 mg/L", "GA 100 mg/L", "GA 500 mg/L"
  )
)
levels(dat.m2$fusion)

# One panel per genotype, boxes coloured by measured variable.
ggplot(dat.m2) +
  geom_boxplot(aes(x = fusion, y = value, colour = variable)) +
  facet_wrap(~Genotype) +
  xlab(" ") +
  ylab("Days after sowing")
Code to add significant differences on the graph, with letters, but with only 1 measured variable (FF), without facet
# Square matrix of pairwise p-values.
# NOTE(review): `pp` is presumably the result of a pairwise test run earlier;
# it is not defined in this snippet.
mymat <- tri.to.squ(pp$p.value)
mymat

# Compact letter display: groups that share a letter do not differ at 0.05.
myletters <- multcompLetters(
  mymat,
  compare = "<=",
  threshold = 0.05,
  Letters = letters
)
myletters

# One row per treatment with its letter(s), ready to use as a text layer.
myletters_df <- data.frame(
  fusion = names(myletters$Letters),
  letter = myletters$Letters
)
myletters_df
# Boxplot of FF per treatment with the significance letters drawn at y = 30.
# The chain must not end with `+`: a dangling operator leaves the expression
# incomplete and R keeps waiting for another layer.
ggplot(pheno, aes(x = fusion, y = FF, colour = fusion)) +
  geom_boxplot() +
  geom_text(data = myletters_df, aes(label = letter, y = 30), colour = "black", size = 5) +
  ylab("Days after sowing") +
  xlab("") +
  labs(title = "Days to female flower production") +
  theme(plot.title = element_text(hjust = 0.5))
> dput(pheno)
structure(list(Genotype = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("F1045",
"FF", "M1585", "M1610"), class = "factor"), X = structure(c(1L,
105L, 116L, 127L, 138L, 149L, 160L, 171L, 182L, 2L, 13L, 24L,
35L, 46L, 57L, 68L, 79L, 90L, 101L, 106L, 107L, 108L, 109L, 110L,
111L, 112L, 113L, 114L, 115L, 117L, 118L, 119L, 120L, 121L, 122L,
123L, 124L, 125L, 126L, 128L, 129L, 130L, 131L, 132L, 133L, 134L,
135L, 136L, 137L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L,
147L, 148L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157L, 158L,
159L, 161L, 162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L,
172L, 173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L, 181L, 183L,
184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 47L, 48L,
49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 58L, 59L, 60L, 61L, 62L,
63L, 64L, 65L, 66L, 67L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L,
77L, 78L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 91L,
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 102L, 103L, 104L
), .Label = c("H1", "H10", "H100", "H101", "H102", "H103", "H104",
"H105", "H106", "H107", "H108", "H109", "H11", "H110", "H111",
"H112", "H113", "H114", "H115", "H116", "H117", "H118", "H119",
"H12", "H120", "H121", "H122", "H123", "H124", "H125", "H126",
"H127", "H128", "H129", "H13", "H130", "H131", "H132", "H133",
"H134", "H135", "H136", "H137", "H138", "H139", "H14", "H140",
"H141", "H142", "H143", "H144", "H145", "H146", "H147", "H148",
"H149", "H15", "H150", "H151", "H152", "H153", "H154", "H155",
"H156", "H157", "H158", "H159", "H16", "H160", "H161", "H162",
"H163", "H164", "H165", "H166", "H167", "H168", "H169", "H17",
"H170", "H171", "H172", "H173", "H174", "H175", "H176", "H177",
"H178", "H179", "H18", "H180", "H181", "H182", "H183", "H184",
"H185", "H186", "H187", "H188", "H189", "H19", "H190", "H191",
"H192", "H2", "H20", "H21", "H22", "H23", "H24", "H25", "H26",
"H27", "H28", "H29", "H3", "H30", "H31", "H32", "H33", "H34",
"H35", "H36", "H37", "H38", "H39", "H4", "H40", "H41", "H42",
"H43", "H44", "H45", "H46", "H47", "H48", "H49", "H5", "H50",
"H51", "H52", "H53", "H54", "H55", "H56", "H57", "H58", "H59",
"H6", "H60", "H61", "H62", "H63", "H64", "H65", "H66", "H67",
"H68", "H69", "H7", "H70", "H71", "H72", "H73", "H74", "H75",
"H76", "H77", "H78", "H79", "H8", "H80", "H81", "H82", "H83",
"H84", "H85", "H86", "H87", "H88", "H89", "H9", "H90", "H91",
"H92", "H93", "H94", "H95", "H96", "H97", "H98", "H99"), class = "factor"),
Hormone = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("CK", "Control", "GA"), class = "factor"),
Hormone.quantity = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("100", "20", "500", "Control"
), class = "factor"), fusion = structure(c(4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("CK 100 mg/L",
"CK 20 mg/L", "CK 500 mg/L", "Control", "GA 100 mg/L", "GA 20 mg/L",
"GA 500 mg/L"), class = "factor"), Sowing.date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "25-mrt", class = "factor"),
BT = structure(c(6L, 7L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 6L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 2L,
2L, 2L, 2L, 2L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 6L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 8L, 4L, 6L, 6L, 6L, 4L, 3L, 4L, 4L, 3L,
4L, 3L, 3L, 3L, 3L, 6L, 6L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 3L, 4L, 3L, 3L, 3L, 4L, 3L, 6L, 6L, 8L, 6L, 4L, 4L,
4L, 8L, 4L, 4L, 2L, 3L, 3L, 3L, 3L, 6L, 3L, 5L, 4L, 5L, 5L,
4L, 3L), .Label = c("16-apr", "17-apr", "18-apr", "19-apr",
"21-mei", "23-apr", "26-apr", "30-apr"), class = "factor"),
ff = structure(c(14L, 20L, 4L, 10L, 20L, 3L, 1L, 14L, 9L,
11L, 20L, 11L, 9L, 9L, 9L, 11L, 12L, 12L, 6L, 12L, 12L, 16L,
12L, 12L, 17L, 17L, 12L, 16L, 17L, 18L, 12L, 6L, 20L, 20L,
15L, 15L, 15L, 20L, 20L, 11L, 11L, 11L, 9L, 9L, 9L, 9L, 20L,
20L, 20L, 4L, 1L, 4L, 4L, 4L, 8L, 20L, 4L, 20L, 12L, 4L,
14L, 14L, 11L, 11L, 15L, 15L, 11L, 11L, 9L, 15L, 9L, 9L,
11L, 11L, 14L, 1L, 5L, 4L, 4L, 20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 15L, 15L, 14L, 13L, 15L, 15L, 11L, 9L, 9L,
11L, 9L, 11L, 1L, 20L, 1L, 20L, 20L, 20L, 20L, 1L, 1L, 4L,
20L, 20L, 20L, 15L, 15L, 14L, 15L, 1L, 15L, 15L, 20L, 11L,
11L, 11L, 11L, 15L, 10L, 10L, 16L, 10L, 12L, 10L, 17L, 8L,
16L, 12L, 8L, 4L, 4L, 8L, 20L, 10L, 1L, 20L, NA, 12L, 10L,
20L, 20L, 20L, 1L, 20L, 1L, 20L, 12L, 16L, 12L, 2L, 8L, 4L,
10L, 4L, 4L, 4L, 10L, 8L, 4L, 8L, 20L, 20L, 20L, NA, 20L,
1L, 20L, 1L, 8L, 20L, 1L, 1L, 7L, 17L, 19L, 19L, 12L, 10L,
12L, 19L, 10L, 10L, 10L, 17L), .Label = c("10-mei", "13-jun",
"14-apr", "14-mei", "17-mei", "18-jun", "21-jun", "21-mei",
"23-apr", "24-mei", "26-apr", "28-mei", "3-apr", "3-mei",
"30-apr", "31-mei", "4-jun", "5-jul", "7-jun", "7-mei"), class = "factor"),
FH = c(3.5, 6, 9, 16, 5.5, 12, 11.5, 4, 4.5, 6, 8, 5, 4.5,
3.5, 4, 5, 20, 42, 14, 40, 27, 42, 27, 26, 16, 18, 35, 17,
20, 28, 15, 20, 33, 32, 14.5, 14.5, 14.5, 35, 32, 12.5, 13.5,
12, 14.5, 12, 15, 14.5, 18, 18, 18.5, 35, 23, 25, 30, 37,
53, 27.5, 37, 25.5, 35, 47, 8.5, 20.5, 13, 14.5, 13.5, 18.5,
10.5, 10, 14.3, 18.5, 15.3, 11.7, 16, 15, 13.5, 26, 36, 30,
43, 23.5, 23.5, 31.5, 29, 30.5, 30, 29, 30, 24.5, 19, 23,
21.5, 26.5, 18.5, 20, 15, 12.3, 17, 12, 15, 13, 43614, 25,
27, 22.5, 35, 23.5, 30, 42, 42, 55, 32.5, 26, 26, 9.5, 4.5,
5.5, 5, 15.5, 10, 4.5, 8.5, 6, 5, 5.5, 5, 4.5, 30, 20, 16,
16, 20, 22, 30, 22, 25, 11, 13.5, 11, 11, 14, 6, NA, 5.5,
7, NA, 12, 14, 7, 9.5, 6.5, 9, 8.5, 12.5, 8, 27, 33, 35,
32, 17, 14, 22, 11, 17, 12, 25, 22, 15, 10, 5, 3, 4, NA,
5, 8, 4.5, 6, 7, 5, 5.5, 7, 42, 23, 23, 21, 14, 21, 17, 22,
19, 18, 17, 17), SRDT = structure(c(2L, 7L, 14L, NA, 7L,
8L, 7L, NA, NA, NA, 3L, NA, 18L, 15L, 17L, 17L, 18L, 18L,
NA, 18L, 15L, 17L, 15L, 20L, 2L, NA, 11L, 17L, 18L, 2L, 2L,
2L, 14L, 12L, 17L, 15L, 12L, 9L, 9L, 6L, 6L, 15L, 15L, 15L,
15L, NA, 17L, 15L, 10L, 11L, 11L, 10L, 11L, 17L, 5L, 21L,
6L, NA, 20L, 5L, 12L, 7L, NA, 17L, 17L, 15L, 15L, 10L, 10L,
6L, 10L, 10L, 21L, NA, 15L, 15L, 5L, 15L, 15L, 11L, 10L,
21L, 1L, 21L, 21L, 21L, 1L, 5L, 18L, 2L, 9L, 9L, NA, 12L,
10L, NA, 16L, 6L, 6L, 15L, 6L, 10L, 10L, 10L, 1L, 10L, 1L,
21L, 21L, 1L, 21L, 5L, 18L, 2L, 17L, 20L, 9L, 14L, 5L, 9L,
9L, 11L, NA, 18L, 10L, 18L, 20L, 4L, 9L, 7L, 2L, 2L, 7L,
5L, 17L, 17L, 11L, 10L, 12L, 2L, 14L, 19L, 19L, 19L, NA,
NA, 2L, 11L, 17L, 14L, 17L, 9L, 10L, 10L, 2L, 7L, 17L, 14L,
2L, 11L, 20L, 2L, 15L, 15L, 11L, 5L, NA, 10L, NA, 2L, 8L,
NA, NA, 14L, 5L, 15L, 15L, NA, 22L, NA, 9L, 9L, 19L, 9L,
9L, 22L, 20L, 13L, 7L, 20L, 15L, 20L), .Label = c("10-mei",
"11-jun", "13-jun", "13-mei", "14-mei", "17-mei", "18-jun",
"2-jul", "21-jun", "21-mei", "24-mei", "25-jun", "26-jun",
"28-jun", "28-mei", "3-mei", "31-mei", "4-jun", "5-jul",
"7-jun", "7-mei", "9-jul"), class = "factor"), MH = c(26,
50, 58, NA, 46, 58, 61, NA, NA, NA, 40, NA, 68, 48, 47, 42,
26, 50, NA, 48, 27, 42, 27, 48, 25, NA, 25, 17, 20, 18, 32,
19, 75, 75, 65, 70, 73, 73, 71, 65, 70, 60, 80, 70, 70, NA,
54, 45, 45, 45, 45, 40, 49, 53, 45, 27.5, 44, NA, NA, 47,
47, 62, NA, 75, 60, 75, 70, 65, 80, 67, 80, 75, 52, NA, 67,
68, 26, 55, 60, 60, 60, 31.5, 39, 30.5, 30, 29, 39, 39, 86,
74, 80, 76, NA, 69, 80, NA, 44, 70, 70, 65, 43, 60, 57, 57,
45, 60, 39, 35, 32.5, 27, 32.5, 43, 70, 75, 60, 66, 58, 48,
41, NA, 44, 42, NA, 44, 39, 40, 48, 53, 50, 50, 45, 45, 50,
13, 25, 11, 21, 20.5, 46, 44, 54, 25, 20, 25, NA, NA, 28,
33, 36, 40, 21, 36, 23.5, 21, 44, 60, 37, 37, 55, 24, 45,
45, 35, 30, 25, 12, 27, 10, NA, 53, 35, NA, NA, 43, 11, 13,
7, NA, 22, NA, 42, 46, NA, 41, 43, 40, 26, 45, 35, 29, 17,
22), SEEDT = structure(c(2L, 4L, 9L, NA, 4L, 5L, 4L, NA,
NA, NA, 4L, NA, 12L, 11L, 11L, 11L, 4L, 3L, NA, 4L, 15L,
4L, 8L, 5L, 7L, NA, 2L, 2L, 8L, 13L, 8L, NA, 13L, 8L, 15L,
15L, 8L, 7L, 7L, 10L, 10L, 11L, 6L, 10L, 10L, NA, 3L, 11L,
12L, 12L, 12L, 12L, 4L, 4L, 12L, 12L, 12L, NA, 9L, 12L, NA,
4L, NA, 2L, 15L, 2L, 15L, 14L, 10L, 12L, 12L, 11L, 11L, NA,
2L, 12L, 8L, 3L, 15L, 11L, 11L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 2L, 2L, 7L, 7L, NA, 8L, 10L, NA, 10L, 10L, 10L,
15L, 10L, 12L, 12L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 10L,
11L, 12L, 2L, 12L, 4L, 7L, 9L, 10L, 7L, 7L, 10L, NA, 12L,
10L, 15L, 2L, 4L, 8L, 8L, 4L, 4L, 13L, 12L, NA, NA, 4L, 7L,
NA, 7L, 13L, 13L, 13L, NA, NA, NA, 2L, 2L, NA, NA, NA, 8L,
NA, NA, 4L, 4L, 2L, NA, 4L, 2L, 7L, 7L, 7L, 2L, 2L, 15L,
1L, 15L, NA, 2L, 5L, NA, NA, 5L, 13L, NA, NA, NA, NA, NA,
16L, 16L, 13L, 16L, 7L, 1L, 7L, 16L, 7L, 7L, 7L, NA), .Label = c("11-jul",
"11-jun", "13-jun", "18-jun", "2-jul", "20-mei", "21-jun",
"25-jun", "28-jun", "28-mei", "31-mei", "4-jun", "5-jul",
"6-apr", "7-jun", "9-jul"), class = "factor"), FERMK = c(7L,
8L, 8L, 7L, 8L, 8L, 8L, 4L, NA, NA, 5L, 7L, 7L, 6L, 7L, 6L,
4L, 6L, NA, 4L, 3L, 4L, 4L, 4L, 2L, NA, 2L, 2L, 2L, 1L, 2L,
2L, 8L, 6L, 6L, 6L, 7L, 7L, 7L, 6L, 6L, 7L, 7L, 6L, 4L, 6L,
6L, 5L, 6L, 5L, 5L, 6L, 5L, 4L, 2L, 5L, NA, NA, 4L, 2L, 5L,
5L, NA, 7L, 7L, 8L, 6L, 6L, 7L, NA, 7L, 7L, 6L, 5L, 5L, 5L,
4L, 4L, 6L, 7L, 6L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 8L, 7L, 7L,
7L, 7L, 7L, 7L, NA, 7L, 7L, 7L, 7L, 5L, 5L, 4L, 5L, 6L, 4L,
6L, 2L, 2L, 2L, 5L, 4L, 7L, 6L, 8L, 7L, 6L, 6L, 8L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 5L, 5L, 4L, 4L, 4L, 4L, 2L, 2L, NA,
3L, 2L, NA, 3L, 6L, 5L, 5L, 6L, NA, 6L, 4L, 6L, 5L, 5L, 5L,
5L, 4L, 5L, 4L, 4L, 6L, 5L, 6L, 5L, 7L, 7L, 7L, 3L, 2L, 3L,
3L, 4L, NA, 5L, 5L, NA, 5L, 5L, 3L, 2L, 3L, NA, 4L, NA, 5L,
4L, 5L, 5L, 6L, 4L, 4L, 3L, 3L, 4L, 5L, NA), PLRMK = c(1L,
2L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA, 1L, 2L, 0L, 0L, 0L, 0L,
1L, 1L, NA, 1L, 1L, 2L, 1L, 1L, 4L, NA, 5L, 5L, 4L, 5L, 3L,
4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, NA,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 4L, 5L, NA, NA, 5L, 6L, 1L,
1L, NA, 1L, 1L, 0L, 1L, 1L, 1L, NA, 2L, 1L, 2L, NA, 2L, NA,
4L, 3L, 2L, 2L, 1L, 4L, 5L, 5L, 4L, 5L, 7L, 6L, 1L, 1L, 1L,
1L, NA, 1L, 2L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 4L, 5L, 2L,
4L, 7L, 5L, 8L, 5L, 2L, 0L, 1L, 1L, 1L, 7L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 0L, NA, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 6L, 6L, NA,
4L, 4L, NA, 2L, 2L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 1L, 1L, 1L,
1L, NA, NA, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 5L, 5L, 4L,
1L, 4L, NA, 2L, 1L, NA, NA, 2L, 2L, 0L, 0L, NA, 1L, NA, 4L,
2L, 1L, 2L, 1L, 2L, 4L, 1L, 2L, 4L, 3L, NA), FF = c(39L,
43L, 50L, 60L, 43L, 20L, 46L, 39L, 29L, 32L, 43L, 32L, 29L,
29L, 29L, 32L, 64L, 64L, 85L, 64L, 64L, 67L, 64L, 64L, 71L,
71L, 64L, 67L, 71L, 102L, 64L, 85L, 43L, 43L, 36L, 36L, 36L,
43L, 43L, 32L, 32L, 32L, 29L, 29L, 29L, 29L, 43L, 43L, 43L,
50L, 46L, 50L, 50L, 50L, 57L, 43L, 50L, 43L, 64L, 50L, 39L,
39L, 32L, 32L, 36L, 36L, 32L, 32L, 29L, 36L, 29L, 29L, 32L,
32L, 39L, 46L, 53L, 50L, 50L, 43L, 43L, 43L, 43L, 43L, 43L,
43L, 43L, 43L, 36L, 36L, 39L, 9L, 36L, 36L, 32L, 29L, 29L,
32L, 29L, 32L, 46L, 43L, 46L, 43L, 43L, 43L, 43L, 46L, 46L,
50L, 43L, 43L, 43L, 36L, 36L, 39L, 36L, 46L, 36L, 36L, 43L,
32L, 32L, 32L, 32L, 36L, 60L, 60L, 67L, 60L, 64L, 60L, 71L,
57L, 67L, 64L, 57L, 50L, 50L, 57L, 43L, 60L, 46L, 43L, NA,
64L, 60L, 43L, 43L, 43L, 46L, 43L, 46L, 43L, 64L, 67L, 64L,
80L, 57L, 50L, 60L, 50L, 50L, 50L, 60L, 57L, 50L, 57L, 43L,
43L, 43L, NA, 43L, 46L, 43L, 46L, 57L, 43L, 46L, 46L, 88L,
71L, 74L, 74L, 64L, 60L, 64L, 74L, 60L, 60L, 60L, 71L), MF = c(78L,
85L, 95L, NA, 85L, 99L, 85L, NA, NA, NA, 80L, NA, 71L, 64L,
67L, 67L, 71L, 71L, NA, 71L, 64L, 67L, 64L, 74L, 78L, NA,
60L, 67L, 71L, 78L, 78L, 78L, 95L, 92L, 67L, 64L, 92L, 88L,
88L, 53L, 53L, 64L, 64L, 64L, 64L, NA, 67L, 64L, 57L, 60L,
60L, 57L, 60L, 67L, 50L, 43L, 53L, NA, 74L, 50L, 92L, 85L,
NA, 67L, 67L, 64L, 64L, 57L, 57L, 53L, 57L, 57L, 43L, NA,
64L, 64L, 50L, 64L, 64L, 60L, 57L, 43L, 46L, 43L, 43L, 43L,
46L, 50L, 71L, 78L, 88L, 88L, NA, 92L, 57L, NA, 39L, 53L,
53L, 64L, 53L, 57L, 57L, 57L, 46L, 57L, 46L, 43L, 43L, 46L,
43L, 50L, 71L, 78L, 67L, 74L, 88L, 95L, 50L, 88L, 88L, 60L,
NA, 71L, 57L, 71L, 74L, 49L, 88L, 85L, 78L, 78L, 85L, 50L,
67L, 67L, 60L, 57L, 92L, 78L, 95L, 102L, 102L, 102L, NA,
NA, 78L, 60L, 67L, 95L, 67L, 88L, 57L, 57L, 78L, 85L, 67L,
95L, 78L, 60L, 74L, 78L, 64L, 64L, 60L, 50L, NA, 57L, NA,
78L, 99L, NA, NA, 95L, 50L, 64L, 64L, NA, 106L, NA, 88L,
88L, 102L, 88L, 88L, 106L, 74L, 93L, 85L, 74L, 64L, 74L),
speed = c(0.08974359, 0.139534884, 0.18, 0.266666667, 0.127906977,
0.6, 0.25, 0.102564103, 0.155172414, 0.1875, 0.186046512,
0.15625, 0.155172414, 0.120689655, 0.137931034, 0.15625,
0.3125, 0.65625, 0.164705882, 0.625, 0.421875, 0.626865672,
0.421875, 0.40625, 0.225352113, 0.253521127, 0.546875, 0.253731343,
0.281690141, 0.274509804, 0.234375, 0.235294118, 0.76744186,
0.744186047, 0.402777778, 0.402777778, 0.402777778, 0.813953488,
0.744186047, 0.390625, 0.421875, 0.375, 0.5, 0.413793103,
0.517241379, 0.5, 0.418604651, 0.418604651, 0.430232558,
0.7, 0.5, 0.5, 0.6, 0.74, 0.929824561, 0.639534884, 0.74,
0.593023256, 0.546875, 0.94, 0.217948718, 0.525641026, 0.40625,
0.453125, 0.375, 0.513888889, 0.328125, 0.3125, 0.493103448,
0.513888889, 0.527586207, 0.403448276, 0.5, 0.46875, 0.346153846,
0.565217391, 0.679245283, 0.6, 0.86, 0.546511628, 0.546511628,
0.73255814, 0.674418605, 0.709302326, 0.697674419, 0.674418605,
0.697674419, 0.569767442, 0.527777778, 0.638888889, 0.551282051,
2.944444444, 0.513888889, 0.555555556, 0.46875, 0.424137931,
0.586206897, 0.375, 0.517241379, 0.40625, 948.1304348, 0.581395349,
0.586956522, 0.523255814, 0.813953488, 0.546511628, 0.697674419,
0.913043478, 0.913043478, 1.1, 0.755813953, 0.604651163,
0.604651163, 0.263888889, 0.125, 0.141025641, 0.138888889,
0.336956522, 0.277777778, 0.125, 0.197674419, 0.1875, 0.15625,
0.171875, 0.15625, 0.125, 0.5, 0.333333333, 0.23880597, 0.266666667,
0.3125, 0.366666667, 0.422535211, 0.385964912, 0.373134328,
0.171875, 0.236842105, 0.22, 0.22, 0.245614035, 0.139534884,
NA, 0.119565217, 0.162790698, NA, 0.1875, 0.233333333, 0.162790698,
0.220930233, 0.151162791, 0.195652174, 0.197674419, 0.27173913,
0.186046512, 0.421875, 0.492537313, 0.546875, 0.4, 0.298245614,
0.28, 0.366666667, 0.22, 0.34, 0.24, 0.416666667, 0.385964912,
0.3, 0.175438596, 0.11627907, 0.069767442, 0.093023256, NA,
0.11627907, 0.173913043, 0.104651163, 0.130434783, 0.122807018,
0.11627907, 0.119565217, 0.152173913, 0.477272727, 0.323943662,
0.310810811, 0.283783784, 0.21875, 0.35, 0.265625, 0.297297297,
0.316666667, 0.3, 0.283333333, 0.23943662), ratiofm = c(7,
4, 8, 7, 8, 8, 8, NA, NA, NA, 5, 3.5, NA, NA, NA, NA, 4,
6, NA, 4, 3, 2, 4, 4, 0.5, NA, 0.4, 0.4, 0.5, 0.2, 0.666666667,
0.5, 8, 6, 6, 6, 7, 7, 7, 3, 3, 3.5, 3.5, 6, 4, NA, 3, 2.5,
3, 5, 2.5, 3, 5, 4, 0.5, 1, NA, NA, 0.8, 0.333333333, 5,
5, NA, 7, 7, NA, 6, 6, 7, NA, 3.5, 7, 3, NA, 2.5, NA, 1,
1.333333333, 3, 3.5, 6, 1, 0.4, 0.4, 0.5, 0.4, 0.285714286,
0.333333333, 8, 7, 7, 7, NA, 7, 3.5, NA, 7, 7, 7, 7, 1.666666667,
1.666666667, 4, 1.25, 1.2, 2, 1.5, 0.285714286, 0.4, 0.25,
1, 2, NA, 6, 8, 7, 0.857142857, 6, NA, 7, 7, NA, NA, NA,
NA, NA, 3.5, 1.666666667, 5, 4, 2, 4, 2, 0.333333333, 0.333333333,
NA, 0.75, 0.5, NA, 1.5, 3, 5, 5, 6, NA, 6, 4, 2, 5, 5, 5,
5, NA, NA, 4, 4, 3, 2.5, 3, 2.5, 2.333333333, 3.5, 3.5, 0.6,
0.4, 0.75, 3, 1, NA, 2.5, 5, NA, NA, 2.5, 1.5, NA, NA, NA,
4, NA, 1.25, 2, 5, 2.5, 6, 2, 1, 3, 1.5, 1, 1.666666667,
NA)), class = "data.frame", row.names = c(NA, -192L))
It would be clearer with pictures of my graphs, but apparently I'm not yet allowed to include pictures in my posts, sorry.
Thanks in advance for your help
you can try
library(tidyverse)
# Box plot of FF for each fusion level, one panel per Genotype, annotated with
# pairwise comparison brackets and a group-comparison p-value.
# NOTE(review): levels(df$fusion) assumes `fusion` is a factor -- confirm.
df %>%
  as_tibble() %>%
  ggplot(aes(x = fusion, y = FF)) +
  geom_boxplot(aes(colour = fusion)) +
  # One bracket for every pair of fusion levels; step_increase staggers the
  # brackets so they do not sit on top of each other.
  ggsignif::geom_signif(
    comparisons = combn(levels(df$fusion), 2, simplify = FALSE),
    step_increase = 0.3
  ) +
  ggpubr::stat_compare_means() +
  facet_wrap(~Genotype) +
  xlab(" ") +
  ylab("Days after sowing")

provide sample size for violin plot

Using the following code I made a violin plot for most of my variables, and added points where I didn't have sufficient information for some data. I'd like to add sample sizes to the right end of each violin, but I haven't been able to find a way to do this.
#dataset
str(threats)
'data.frame': 60 obs. of 3 variables:
$ threat : Factor w/ 7 levels "weather","competition",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Species : Factor w/ 5 levels "Bank","Barn",..: 1 1 1 1 1 1 1 1 1 1 ...
$ effect.abs : int 18 13 0 43 43 0 23 13 14 16 ...
# Shift every effect by 0.1 so that zero values can be shown on the
# logarithmic axis used below.
threats[["effect.abs1"]] <- threats[["effect.abs"]] + 0.1

# Threat/species combinations with insufficient information for a violin;
# these rows are drawn with geom_dotplot() instead.
threats.sub <- subset(
  threats,
  threat %in% c("competition", "insect_availability", "incidental_loss") |
    (threat == "disease" & Species == "Barn") |
    (threat == "weather" & Species %in% c("Cliff", "Purple")) |
    (threat == "predation" & Species == "Bank")
)

# Dots for the sparse subset, violins for the full data, one facet per threat.
ggplot() +
  geom_dotplot(
    data = threats.sub,
    mapping = aes(x = Species, y = effect.abs1, fill = Species),
    binaxis = "y",
    stackdir = "center",
    binwidth = 0.1
  ) +
  geom_violin(
    data = threats,
    mapping = aes(x = Species, y = effect.abs1, fill = Species)
  ) +
  coord_flip() +
  facet_wrap(
    ~threat,
    ncol = 2,
    labeller = labeller(threat = facet.labels),
    strip.position = "left"
  ) +
  # The 0.1 break is labelled 0 because of the offset added above.
  scale_y_log10(breaks = c(0.1, 1, 10, 100), labels = c(0, 1, 10, 100)) +
  labs(x = "Threat", y = "Absolute effect on adult survival (%)") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 9, colour = "black"),
    axis.title = element_text(size = 10, colour = "black"),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank(),
    panel.border = element_rect(colour = "black", size = 1),
    plot.margin = unit(c(.3, .3, .4, .4), "cm"),
    strip.background = element_rect(fill = NA, colour = NA),
    legend.position = "right"
  )
My attempts to use the solution below (provided in other questions), only resulted in an error message.
# Summary helper for stat_summary(fun.data = ...): returns a named vector with
# the label position `y` (the mean of x) and the `label` itself (the number of
# values in x).
give.n <- function(x) {
  centre <- mean(x)
  n_obs <- length(x)
  c(y = centre, label = n_obs)
}
stat_summary(fun.data = give.n, geom = "text") #added to ggplot code above
Error in if (empty(data)) { : missing value where TRUE/FALSE needed
I would appreciate any help with this issue. I'd prefer to find a way for R to calculate the sample sizes (rather than me providing each one), as I also keep getting the following warning message when I produce this figure and I'd like to double-check that all the data is being displayed correctly.
Warning messages:
1: In max(data$density) : no non-missing arguments to max; returning -Inf
2: In max(data$density) : no non-missing arguments to max; returning -Inf
3: In max(data$density) : no non-missing arguments to max; returning -Inf
Thanks!
As requested:
structure(list(threat = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L), .Label = c("weather", "competition", "incidental_loss",
"contaminants", "insect_availability", "disease", "predation"
), class = "factor"),
Species = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 5L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L,
5L, 5L, 1L, 2L, 2L, 2L, 2L), .Label = c("Bank", "Barn", "Cliff",
"Tree", "Purple"), class = "factor"),
effect.abs = c(18L,
13L, 0L, 43L, 43L, 0L, 23L, 13L, 14L, 16L, 18L, 29L, 0L, 40L,
0L, 20L, 53L, 0L, 17L, 15L, 13L, 25L, 19L, 25L, 0L, 0L, 0L, 14L,
20L, 0L, 0L, 0L, 0L, 4L, 1L, 0L, 1L, 1L, 1L, 1L, 12L, 0L, 30L,
95L, 10L, 3L, 7L, 12L, 14L, 100L, 0L, 23L, 13L, 5L, 0L, 58L,
20L, 4L, 9L, 0L)), row.names = c(NA, -60L), class = "data.frame")
The way to tackle this is to precompute your n's
E.G.
# Sample size per violin: one row per threat/Species combination. Grouping by
# effect.abs1 as well would count repeated values instead of observations.
# maxE is the largest plotted value in the group, used to place the label at
# the far end of the violin (the right end once the plot is flipped).
summary_df <- threats %>%
  group_by(threat, Species) %>%
  summarise(n = n(), maxE = max(effect.abs1)) %>%
  ungroup()
Then add it to your graph
+ geom_label(aes(x = Species, y = maxE, label = n), data = summary_df)
Thanks to @Jack Brookes for the helpful comments that got me started on this. Here is my final solution for this issue.
# Sample size and largest plotted value for every threat/Species group in the
# full data set.
summary_df_all <- threats %>%
  group_by(threat, Species) %>%
  summarise(n = n(), maxE = max(effect.abs1))

# Same summary for the dot-plotted subset, whose n's are not wanted;
# probability = 0 marks these groups so they can be dropped after the join.
summary_df_sub <- threats.sub %>%
  group_by(threat, Species) %>%
  summarise(n = n(), maxE = max(effect.abs1)) %>%
  mutate(probability = 0)

# Combine the summaries: groups missing from the subset come back with
# probability = NA, are set to 1, and are the only rows that pass the filter.
summary_df_violin <- summary_df_all %>%
  left_join(summary_df_sub, by = c("threat", "Species")) %>%
  mutate(probability = ifelse(is.na(probability), 1, probability)) %>%
  filter(probability > 0)

# Plot: dots for the subset, violins for everything, and the sample size as
# text just beyond each violin's largest value.
ggplot() +
  geom_dotplot(
    data = threats.sub,
    mapping = aes(x = Species, y = effect.abs1, colour = Species, fill = Species),
    binaxis = "y",
    stackdir = "center",
    binwidth = 0.09
  ) +
  geom_violin(
    data = threats,
    mapping = aes(x = Species, y = effect.abs1, colour = Species, fill = Species),
    size = 1.1
  ) +
  # n.x / maxE.x are the columns from summary_df_all after the join.
  geom_text(
    data = summary_df_violin,
    mapping = aes(y = maxE.x, x = Species, label = n.x),
    nudge_y = 0.2
  ) +
  coord_flip() +
  facet_wrap(
    ~threat,
    ncol = 2,
    labeller = labeller(threat = facet.labels),
    strip.position = "left"
  ) +
  scale_y_log10(breaks = c(0.1, 1, 10, 100), labels = c(0, 1, 10, 100)) +
  labs(x = "Threat", y = "Absolute effect on adult survival (%)") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 9, colour = "black"),
    axis.title = element_text(size = 10, colour = "black"),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank(),
    panel.border = element_rect(colour = "black", size = 1),
    plot.margin = unit(c(.3, .3, .4, .4), "cm"),
    strip.background = element_rect(fill = NA, colour = NA),
    strip.text = element_text(size = 9, colour = "black"),
    legend.position = "right"
  )

ggplot group by fill and show mean

I'm working on a heatmap and following along this tutorial:
https://www.r-graph-gallery.com/283-the-hourly-heatmap/
To save a click, here's the code block to reproduce:
library(ggplot2)
library(dplyr) # easier data wrangling
library(viridis) # colour blind friendly palette, works in B&W also
library(Interpol.T) # will generate a large dataset on initial load
library(lubridate) # for easy date manipulation
library(ggExtra) # because remembering ggplot theme options is beyond me
library(tidyr)
# Load the example data set. The objects used below (h_d_t and the ones removed
# in the cleanup step) are expected in the workspace after this call.
data <- data(Trentino_hourly_T, package = "Interpol.T")
names(h_d_t)[1:5] <- c("stationid", "date", "hour", "temp", "flag")

# Keep a single station. as_tibble() replaces the deprecated tbl_df().
df <- as_tibble(h_d_t) %>%
  filter(stationid == "T0001")

# Split the date into the components used for faceting and the x axis.
df <- df %>%
  mutate(
    year = year(date),
    month = month(date, label = TRUE),
    day = day(date)
  )

df$date <- ymd(df$date) # not necessary for plot but
# useful if you want to do further work with the data

# cleanup
rm(list = c(
  "h_d_t", "mo_bias", "Tn", "Tx",
  "Th_int_list", "calibration_l",
  "calibration_shape", "Tm_list"
))

# create plotting df
df <- df %>% select(stationid, day, hour, month, year, temp)
Then a heatmap is made:
# Hourly temperature heatmap: day on x, hour on a reversed y axis, one panel
# per year (rows) and month (columns).
p <- ggplot(df, aes(x = day, y = hour, fill = temp)) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "Hrly Temps C", option = "C") +
  facet_grid(year ~ month) +
  scale_y_continuous(trans = "reverse", breaks = unique(df$hour))
So far so good, I can recreate this. However my own dataset is website visit data at the visit level, so many visits in a given day and hour. In addition to visits I also have a timeOnPage metric.
Sample of data below with dput.
I would like to heatmap the average hourly visits or timeOnPage. Here's what I tried.
Sample of my data:
> dput(sam)
structure(list(Day = structure(c(4L, 4L, 4L, 5L, 3L, 2L, 3L,
6L, 2L, 2L, 4L, 2L, 3L, 3L, 6L, 1L, 4L, 2L, 3L, 5L, 2L, 5L, 4L,
2L, 5L, 2L, 7L, 5L, 6L, 2L, 2L, 6L, 4L, 6L, 2L, 2L, 2L, 5L, 5L,
2L, 6L, 5L, 3L, 5L, 3L, 2L, 6L, 4L, 2L, 5L, 2L, 5L, 4L, 2L, 6L,
2L, 7L, 2L, 2L, 2L, 5L, 6L, 3L, 2L, 3L, 4L, 4L, 3L, 6L, 2L, 5L,
3L, 4L, 4L, 3L, 2L, 5L, 5L, 5L, 3L, 5L, 2L, 4L, 5L, 5L, 2L, 3L,
6L, 2L, 2L, 5L, 4L, 6L, 7L, 3L, 3L, 4L, 4L, 2L, 6L), .Label = c("Sun",
"Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"), class = c("ordered",
"factor")), Hour = c(18L, 7L, 3L, 22L, 11L, 11L, 9L, 16L, 16L,
13L, 18L, 18L, 10L, 19L, 7L, 13L, 18L, 14L, 10L, 20L, 17L, 6L,
21L, 15L, 18L, 7L, 12L, 10L, 16L, 14L, 18L, 13L, 17L, 10L, 19L,
20L, 14L, 16L, 10L, 9L, 16L, 9L, 8L, 13L, 17L, 17L, 11L, 15L,
22L, 17L, 18L, 17L, 7L, 19L, 12L, 2L, 12L, 15L, 7L, 17L, 17L,
18L, 13L, 10L, 19L, 9L, 13L, 13L, 17L, 21L, 23L, 4L, 17L, 12L,
12L, 9L, 17L, 19L, 7L, 4L, 5L, 17L, 6L, 23L, 3L, 14L, 19L, 13L,
7L, 11L, 9L, 13L, 9L, 19L, 11L, 5L, 20L, 20L, 19L, 11L), sessionID = c("1508980591045.l027p6mt",
"1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.xasqfwqd10v1qdtl6jemi",
"1510082622485.szj2ja1e", "1511204933263.mq9bvi0d", "1511285142249.vp2fyfd9",
"1510965282725.x04h1dko", "1508801295434.e056cpef", "1508790369346.ly63bjgr",
"1509585154520.3usd036k", "1511834881064.e6f5evp", "1509471114265.2u807dwo",
"1507688054076.9dls0jk", "1509721031589.ho125mpb", "1510521845178.99j1ibkr",
"1510194555297.ioepfjgr", "1508793469455.hkc3xwa8", "1511288175700.62n5oc5",
"1510287319653.7ye9sjc", "1511227016523.yyn1of99", "1511448209341.1u5vir5p",
"1510205972493.qvu4ev7o", "1510615247987.swxhwct", "1508463701266.p52sdjzp",
"1510588449881.d6ffruv9", "1507404213416.rovwmmge", "1510857718956.2z57w2vr",
"1510360661780.19hznp3m78pvi", "1511820500742.48cyvo2a", "1508809029952.up0wqq5h",
"1508533120441.gdvhacjr7jswiquwuyp66r", "1509583258224.j8krac0sz5kx8pxohl4n29",
"1511549442901.5vm7na1l", "1508811367845.7b36epqk", "1509421407861.om0ydylt",
"1508794534361.p3gcoa0e", "1510877729807.viad220f", "1511460355269.omwvd00l",
"1508775703610.usuk2akm", "1510964376869.7e2crw9d", "1510247098808.np9ia23",
"1508860753512.3z4182b", "1510868797935.3nmpvkri", "1510105270807.4evhpys",
"1511831565084.27izf13f", "1510340973580.l9qj5drou5wmi", "1508364715184.14l4ikj",
"1509426566404.9qnp0m3", "1510275972333.hhqu0exc", "1510625679744.jk3vvt1v",
"1510881839700.c34skful", "1511365134270.57thqyir", "1509416741055.1f2cnmrp",
"1509738404263.8ajwpij", "1510570338116.h9a5j88", "1511640706961.qw8q1eh",
"1510011913201.eqd54kw", "1508769010911.wrpb329", "1508803518777.56b2ej2l",
"1509670743316.yhncp17j", "1511576965410.y47g0wgj", "1508876390209.wem8i3lh",
"1508779846415.hyx8qar", "1511322782502.s835px9", "1509554323957.osxgi0em",
"1510176829762.jncm9xwb", "1509482328620.sqdbob0u", "1508545652936.a5hqcmp1fw29",
"1508817816447.6mbdldxb", "1510297785623.33i6yhko", "1508843299131.3m26sqf5",
"1510191633431.cl5fh9ik", "1509565114633.bd5yrkf5", "1510690660714.818yxn5o",
"1507567660773.ybpbfgn", "1509667501973.1a9f9pyp", "1509674601865.yqvmcclv",
"1511450423709.s149r25q", "1511267096892.n5u1d0nv", "1509624499459.u57lgtt8",
"1510019204298.ka4w9kfh", "1511362131909.t26h6ig", "1510904968660.eowoea2q",
"1510225256391.4dk073ej", "1510006654569.reo2eili", "1509501692686.ng48bwnz",
"1509741958143.bxbf325r", "1508770633217.33ymrfgc", "1511810438817.zcgpr6vj",
"1510852180447.wywsj7f", "1510176833767.nev0iaec", "1509727547082.53van2sr",
"1507430914148.niu297m", "1508868705810.akd7r18h", "1510060231388.mz9ojf6g",
"1509592760232.qtrlxye8", "1509592651211.1r82ucw4", "1508812928318.f3st4004",
"1509734102140.leol1dnw"), uniquePageviews = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), timeOnPage = c(359, 149, 69, 146, 147, 119, 168, 69, 29, 0,
1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, 0, 1141, 150,
236, 74, 128, 23, 147, 172, 223, 225, 88, 69, 156, 0, 49, 110,
150, 70, 123, 30, 145, 1629, 1, 119, 169, 48, 136, 529, 130,
149, 124, 281, 2483, 0, 60, 149, 50, 29, 124, 149, 0, 92, 149,
915, 47, 50, 89, 143, 84, 129, 147, 138, 80, 33, 226, 70, 146,
177, 98, 150, 32, 148, 149, 12, 338, 146, 204, 149, 148, 26,
149, 1110, 148, 23, 151, 0, 100, 0, 28)), row.names = c(20219L,
42612L, 42149L, 46707L, 40122L, 57449L, 60878L, 56707L, 11725L,
10102L, 29911L, 71743L, 25952L, 1492L, 35570L, 48411L, 43917L,
10530L, 61004L, 46446L, 58846L, 65695L, 44287L, 49341L, 2999L,
48502L, 627L, 54118L, 48148L, 70166L, 13346L, 4770L, 29745L,
67979L, 13832L, 24814L, 10692L, 54744L, 65995L, 8216L, 56683L,
44920L, 18121L, 54499L, 41155L, 71353L, 47606L, 1900L, 25023L,
45811L, 49937L, 54904L, 63607L, 24571L, 36060L, 48479L, 69086L,
37708L, 7353L, 12117L, 33912L, 68752L, 19081L, 8768L, 62647L,
28317L, 43172L, 26286L, 6359L, 14907L, 46733L, 16418L, 43797L,
28637L, 51671L, 1273L, 33677L, 34226L, 65759L, 60247L, 31739L,
38171L, 63497L, 55589L, 44462L, 37454L, 27141L, 36178L, 7543L,
69636L, 54030L, 43173L, 35743L, 852L, 18784L, 39283L, 30672L,
30663L, 14142L, 35933L), class = "data.frame", .Names = c("Day",
"Hour", "sessionID", "uniquePageviews", "timeOnPage"))
It looks like this:
> head(sam)
Day Hour sessionID uniquePageviews timeOnPage
20219 Wed 18 1508980591045.l027p6mt 1 359
42612 Wed 7 1510155616668.57i2wj1 1 149
42149 Wed 3 1510140439620.qu19kyo 1 69
46707 Thurs 22 1510296404412.xasqfwqd10v1qdtl6jemi 1 146
40122 Tues 11 1510082622485.szj2ja1e 1 147
57449 Mon 11 1511204933263.mq9bvi0d 1 119
> glimpse(sam)
Observations: 100
Variables: 5
$ Day <ord> Wed, Wed, Wed, Thurs, Tues, Mon, Tues, Fri, Mon, Mon, Wed, Mon, Tues, Tues, Fri, Sun, Wed, M...
$ Hour <int> 18, 7, 3, 22, 11, 11, 9, 16, 16, 13, 18, 18, 10, 19, 7, 13, 18, 14, 10, 20, 17, 6, 21, 15, 1...
$ sessionID <chr> "1508980591045.l027p6mt", "1510155616668.57i2wj1", "1510140439620.qu19kyo", "1510296404412.x...
$ uniquePageviews <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ timeOnPage <dbl> 359, 149, 69, 146, 147, 119, 168, 69, 29, 0, 1542, 148, 242, 49, 457, 175, 175, 97, 79, 12, ...
The metric uniquePageviews will always be 1 or 0, and in a heatmap it doesn't look great. Since it's session-level data there are multiple entries for each day / hour. For timeOnPage I would like to heatmap the mean time on page for a given hour and day of week combination.
So, as far as I can tell ggplot is summing everything whereas I want mean().
My initial code block:
# creates the initial heatmap (one tile is drawn per row of sam)
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews)) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "TimeOnPage", option = "C")
# order by hour of day going top to bottom asc; the breaks come from the
# plotted data (sam$Hour) rather than the tutorial's unrelated df$hour
p <- p + scale_y_continuous(trans = "reverse", breaks = unique(sam$Hour))
I tried changing it to this but the results look the exact same:
# gets the initial heatmap
# NOTE(review): stat and fun.y are passed to ggplot() itself, not to a layer;
# ggplot() does not use extra arguments, which would explain why the result
# looks the same as the first attempt.
p <- ggplot(sam, aes(x = Day, y = Hour, fill = uniquePageviews),
  stat = "summary", fun.y = "mean"
) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "Mean TimeOnPage", option = "C")
# order by hour of day going top to bottom asc; the breaks come from the
# plotted data (sam$Hour) rather than the tutorial's unrelated df$hour
p <- p + scale_y_continuous(trans = "reverse", breaks = unique(sam$Hour))
I could do some dplyr group by transformations on the dataframe sam but I was not sure if ggplot::geom_tile() takes care of that or not?
How can I create a heatmap with ggplot where the fill is based on mean? Also, can someone clarify what exactly it's showing now? Total sum?
Not sure if I get your problem but you can try following:
library(tidyverse)
library(viridis)
# Average timeOnPage within every Day/Hour combination first, then draw one
# tile per combination filled by that mean.
d %>%
  group_by(Day, Hour) %>%
  summarise(Mean = mean(timeOnPage)) %>%
  ggplot(mapping = aes(x = Day, y = Hour, fill = Mean)) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "TimeOnPage", option = "C")
This will calculate the mean timeOnPage per Day and Hour and plot it as a heatmap.

How do I display percentage in geom_line in ggplot2?

I am trying to display percentages in ggplot2 using geom_line and geom_point.
My code is:
# Question code as posted: line + point plot of tweets per hour, one series per
# word, with the y axis formatted as percentages.
# NOTE(review): per the error quoted below, `..count..` is not available here;
# it is a variable computed by counting stats, which geom_line()/geom_point()
# do not apply by default. dfComb already has a `count` column.
print(ggplot(data=dfComb, aes(x=hour_adj, y=(..count..)/sum(..count..), group=word)) +
geom_line(aes(colour=dfComb$word)) +
geom_point(aes(colour=dfComb$word))
+ ggtitle(paste("Hourly Frequencies of Tweets")) +
xlab("Hour of Day") +
ylab("Count") +
# Legend entries: the factor levels "A"/"B" are shown under the given labels.
scale_colour_discrete("Word", breaks=c("A","B"), labels=c("yid", "abbo")) +
scale_y_continuous(labels = scales::percent)
)
This errors:
Error in FUN(X[[i]], ...) : object 'count' not found
because the ..count.. variable is only created by geom_histogram (I think!) and not geom_line.
Is there an easy way to use percentages with geom_line?
FYI: EDIT, my data is:
dput(dfComb)
structure(list(hour_adj = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L), count = c(44,
24, 22, 36, 26, 18, 39, 35, 50, 46, 46, 41, 57, 49, 34, 56, 54,
54, 49, 45, 36, 49, 43, 47, 35, 20, 18, 10, 10, 25, 25, 26, 32,
25, 29, 39, 37, 45, 52, 43, 46, 67, 38, 69, 108, 80, 73, 48),
word = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor")), .Names = c("hour_adj",
"count", "word"), row.names = c(NA, -48L), class = "data.frame")
You can calculate percentage in the dataframe first.
Also, as per Roman Lustrik's comment, it's better to call variable by name from within aes().
library(dplyr)
# sample data
set.seed(1)
dfComb <- data.frame(
  hour_adj = rep(0:4, 2),
  count = sample(10:50, 10, replace = TRUE),
  word = c(rep("A", 5), rep("B", 5))
)

# Convert each count to its share of that word's total before plotting, so the
# y aesthetic is an ordinary column and no computed stat variable is needed.
ggplot(
  dfComb %>%
    group_by(word) %>%
    mutate(perc = count / sum(count)) %>%
    ungroup(),
  aes(x = hour_adj, y = perc, group = word, colour = word)
) +
  geom_line() +
  geom_point() +
  ggtitle("Hourly Frequencies of Tweets") +
  xlab("Hour of Day") +
  ylab("Count") +
  scale_colour_discrete("Word", breaks = c("A", "B"), labels = c("yid", "abbo")) +
  # perc is a proportion in [0, 1]; scales::percent renders it as a percentage.
  scale_y_continuous(labels = scales::percent)

Resources