Related
I have to do a ggplot barplot with errorbars, Tukey sig. letters for plants grown with different fertilizer concentraitions.
The data should be grouped after the dif. concentrations and the sig. letters should be added automaticaly.
I have already a code for the same problem but for Boxplot - which is working nicely. I tried several tutorials with barplots but I always get the problem; stat_count() can only have an x or y aesthetic.
So I thought, is it possible to get my boxplot code to a barplot code? I tried but I couldnt do it :) And if not - how do I automatically add tukeyHSD Test result sig. letters to a ggplot barplot?
This is my Code for the boxplot with the tukey letters:
value_max = Dünger, group_by(Duenger.g), summarize(max_value = max(Höhe.cm))
hsd=HSD.test(aov(Höhe.cm~Duenger.g, data=Dünger),
trt = "Duenger.g", group = T) sig.letters <- hsd$groups[order(row.names(hsd$groups)), ]
J <- ggplot(Dünger, aes(x = Duenger.g, y = Höhe.cm))+ geom_boxplot(aes(fill= Duenger.g))+ scale_fill_discrete(labels=c("0.5g", '1g', "2g", "3g", "4g"))+ geom_text(data = value_max, aes(x=Duenger.g, y = 0.1 + max_value, label = sig.letters$groups), vjust=0)+ stat_boxplot(geom = 'errorbar', width = 0.1)+ ggtitle("Auswirkung von Dünger auf die Höhe von Pflanzen") + xlab("Dünger in g") + ylab("Höhe in cm"); J
This is how it looks:
boxplot with tukey
Data from dput:
structure(list(Duenger.g = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4), plant = c(1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 18, 19,
21, 23, 24, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39, 40,
41, 42, 43, 44, 48, 49, 50, 53, 54, 55, 56, 57, 58, 61, 62, 64,
65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 79, 80, 81, 83, 85, 86,
88, 89, 91, 93, 99, 100, 102, 103, 104, 105, 106, 107, 108, 110,
111, 112, 113, 114, 115, 116, 117, 118, 120, 122, 123, 125, 126,
127, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 143, 144,
145, 146, 147, 149), height.cm = c(5.7, 2.8, 5.5, 8, 3.5, 2.5,
4, 6, 10, 4.5, 7, 8.3, 11, 7, 8, 2.5, 7.4, 3, 14.5, 7, 12, 7.5,
30.5, 27, 6.5, 19, 10.4, 12.7, 27.3, 11, 11, 10.5, 10.5, 13,
53, 12.5, 12, 6, 12, 35, 8, 16, 56, 63, 69, 62, 98, 65, 77, 32,
85, 75, 33.7, 75, 55, 38.8, 39, 46, 35, 59, 44, 31.5, 49, 34,
52, 37, 43, 38, 28, 14, 28, 19, 20, 23, 17.5, 32, 16, 17, 24.7,
34, 50, 12, 14, 21, 33, 39.3, 41, 29, 35, 48, 40, 65, 35, 10,
26, 34, 41, 32, 38, 23.5, 22.2, 20.5, 29, 34, 45)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -105L))
Thank you
mirai
A bar chart and a boxplot are two different things. By default geom_boxplot computes the boxplot stats by default (stat="boxplot"). In contrast when you use geom_bar it will by default count the number of observations (stat="count") which are then mapped on y. That's the reason why you get an error. Hence, simply replacing geom_boxplot by geom_bar will not give your your desired result. Instead you could use e.g. stat_summary to create your bar chart with errorbars. Additionally I created a summary dataset to add the labels on the top of the error bars.
library(ggplot2)
library(dplyr)
library(agricolae)
Dünger <- Dünger |>
rename("Höhe.cm" = height.cm) |>
mutate(Duenger.g = factor(Duenger.g))
hsd <- HSD.test(aov(Höhe.cm ~ Duenger.g, data = Dünger), trt = "Duenger.g", group = T)
sig.letters <- hsd$groups %>% mutate(Duenger.g = row.names(.))
duenger_sum <- Dünger |>
group_by(Duenger.g) |>
summarize(mean_se(Höhe.cm)) |>
left_join(sig.letters, by = "Duenger.g")
ggplot(Dünger, aes(x = Duenger.g, y = Höhe.cm, fill = Duenger.g)) +
stat_summary(geom = "bar", fun = "mean") +
stat_summary(geom = "errorbar", width = .1) +
scale_fill_discrete(labels = c("0.5g", "1g", "2g", "3g", "4g")) +
geom_text(data = duenger_sum, aes(y = ymax, label = groups), vjust = 0, nudge_y = 1) +
labs(
title = "Auswirkung von Dünger auf die Höhe von Pflanzen",
x = "Dünger in g", y = "Höhe in cm"
)
#> No summary function supplied, defaulting to `mean_se()`
But as the summary dataset now already contains the mean and the values for the error bars a second option would be to do:
ggplot(duenger_sum, aes(x = Duenger.g, y = y, fill = Duenger.g)) +
geom_col() +
geom_errorbar(aes(ymin = ymin, ymax = ymax), width = .1) +
scale_fill_discrete(labels = c("0.5g", "1g", "2g", "3g", "4g")) +
geom_text(aes(y = ymax, label = groups), vjust = 0, nudge_y = 1) +
labs(
title = "Auswirkung von Dünger auf die Höhe von Pflanzen",
x = "Dünger in g", y = "Höhe in cm"
)
I want to annotate my boxplot (create in Base R) with some text and latex formula's, I tried with $..formula..$, but didn't work. Does anyone know a solution?
i = c(1:20)
X = c(13, 18, 25, 58, 25, 31, 39, 42, 17, 35, 46, 22, 18, 20, 26, 14, 33, 19, 20, 21)
df = data.frame(i, X)
boxplot(df$X, data=df, main="Belminuten Data",
xlab=" ", ylab="Aantal Belminuten",
frame = FALSE,
ylimit = c(10, 60),
range=3)
text(x = c(1.3), y = 60, "n = 20") # n should be in italic or in formula style
text(x = c(.7), y = 23.5, "Med = 23.5")
text(x = c(.7), y = 18.5, "Q_1 = 18.5")
library(latex2exp)
i = c(1:20)
X = c(13, 18, 25, 58, 25, 31, 39, 42, 17, 35, 46, 22, 18, 20, 26, 14,
33, 19, 20, 21)
df = data.frame(i, X)
boxplot(df$X, data=df, main="Belminuten Data",
xlab=" ", ylab="Aantal Belminuten",
frame = FALSE,
ylimit = c(10, 60),
range=3)
text(x = c(1.3), y = 60, TeX('$n = 20$'))
text(x = c(.7), y = 13.0, TeX('$Min = 13$'))
text(x = c(.7), y = 18.5, TeX('$Q_1 = 18.5$'))
text(x = c(.7), y = 23.5, TeX('$Med = 23.5$'))
text(x = c(.7), y = 34.0, TeX('$Q_3 = 34$'))
text(x = c(.7), y = 58.0, TeX('$Max = 58$'))
This is my first time asking a question, so I apologize for any formatting issues or anything that makes this difficult to answer. Please let me know what I need to add to be able to the answer question.
I'm attempting to compare differences between 2 unequal group sizes (one ~ 97 the other ~ 714). The reason for the large discrepancy is I am looking at a program done by one class to see if it is significantly different than what has occurred in previous classes. I've been reading about robust stats recently and decided to use a yuen bootstrap in R-Studio from the WRS2 package for a more valid comparison, especially with the difference in sample size.
My formula is
yuenbt(DataExample$PT500 ~ DataExample3$ClassPT500, tr = 0.2, nboot = 599, side = TRUE)
and it returns
Call:
yuenbt(formula = DataExample$PT500 ~ DataExample$ClassPT500,
tr = 0.2, nboot = 599, side = TRUE)
Test statistic: NA (df = NA), p-value = 0
Trimmed mean difference: -65
95 percent confidence interval:
NA NA
The NA's return on other variables that I've tried out as well, or in some cases the confidence interval will state INF. Any ideas why this is happening (such a big difference in sample size?) and suggestions on what the next best step would be are greatly appreciated.
Here is a sample of data:
structure(list(PrePT500 = c(74, 105, 121, 128), PostPT500 = c(191,
264, 327, 314), PT500 = c(117, 159, 206, 186), PrePullups = c(0,
NA, NA, 2), PostPullups = c(3, NA, NA, 3), Pullups = c(3, NA,
NA, 1), PreSitups = c(46, 40, 25, 33), PostSitups = c(41, 61,
39, 49), Situps = c(-5, 21, 14, 16), PreMC = c(8, 16, 29, 19),
PostMC = c(41, 45, 60, 60), MC = c(33, 29, 31, 41), PrePushups = c(20,
16, 28, 30), PostPushups = c(40, 47, 50, 50), Pushups = c(20,
31, 22, 20), Pre1.5 = c(1048, 917, 902, 905), Post1.5 = c(846,
748, 696, 760), X1.5 = c(-202, -169, -206, -145), Pre220 = c(43,
50, 41, 45), Post220 = c(39, 40, 32, 34), X220 = c(-4, -10,
-9, -11), PreAgility = c(20.96, NA, 21.1, 19.88), PostAgility = c(19.69,
NA, 18.8, 20.79), Agility = c(-1.27, NA, -2.3, 0.91), PreBD = c(6.17,
7.82, 5.08, 7), PostBD = c(5, 4.87, 4.68, 6.2), BD = c(-1.17,
-2.95, -0.4, -0.8), PreCL = c(7.05, 13.6, 14.4, 8.8), PostCL = c(8.1,
8.9, 8.27, 7.6), CL = c(1.05, -4.7, -6.13, -1.2), PreSW = c(10.2,
NA, 20.34, 8), PostSW = c(11.4, NA, 9.3, 7.4), SW = c(1.2,
NA, -11.04, -0.6), Pre500 = c(115, 128, 107, 114), Post500 = c(105,
112, 93, 99), X500 = c(-10, -16, -14, -15), PreTotal = c(446,
91, 255, NA), PostTotal = c(493, 439, 503, NA), Total = c(47,
348, 248, NA), ClassPrePT500 = c(338, 213, 215, 243), ClassPostPT500 = c(430,
396, 333, 314), ClassPT500 = c(92, 183, 118, 71), ClassPrePullups = c(6,
5, 2, 0), ClassPostPullups = c(13, 7, 15, 0), ClassPullups = c(7,
2, 13, 0), ClassPreSitups = c(59, 42, 45, 53), ClassPostSitups = c(75,
70, 51, 53), ClassSitups = c(16, 28, 6, 0), ClassPreMC = c(60,
43, 31, 48), ClassPostMC = c(60, 60, 31, 60), ClassMC = c(0,
17, 0, 12), ClassPrePushups = c(50, 37, 26, 30), ClassPostPushups = c(50,
50, 47, 34), ClassPushups = c(0, 13, 21, 4), ClassPre1.5 = c(803,
810, 803, 741), ClassPost1.5 = c(700, 690, 664, 661), Class1.5 = c(-103,
-120, -139, -80), ClassPre220 = c(32, 41, 31, 40), ClassPost220 = c(31,
33, 30, 37), Class220 = c(-1, -8, -1, -3), ClassPreAgility = c(19,
23, 18, 22.1), ClassPostAgility = c(16.4, 18, 16.5, 20.3),
ClassAgility = c(-2.6, -5, -1.5, -1.8), ClassPreBD = c(6.4,
8.5, 5.8, 11.2), ClassPostBD = c(5.3, 5.8, 5.5, 7.5), ClassBD = c(-1.1,
-2.7, -0.3, -3.7), ClassPreCL = c(7.8, 9.3, 7.3, 9.6), ClassPostCL = c(7.6,
7.4, 7.4, 9.2), ClassCL = c(-0.2, -1.9, 0.100000000000001,
-0.4), ClassPreSW = c(8.5, 8.4, 7.7, NA), ClassPostSW = c(7.8,
8.1, 7.6, 8), ClassSW = c(-0.7, -0.300000000000001, -0.100000000000001,
NA), ClassPre500 = c(102, 104, 100, 108), ClassPost500 = c(94,
88, 98, 101), Class500 = c(-8, -16, -2, -7), ClassPreTotal = c(495,
418, 528, 264), ClassPostTotal = c(561, 539, 562, 482), ClassTotal = c(66,
121, 34, 218)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
Thank you in advance for any help.
The R function
yuenbt(x, y, tr=0.2, alpha=0.05, nboot=599, side=F) computes a 1 − α confidence interval for μt 1 − μt 2 using the bootstrap-t method, where the default amount of trimming (tr) is 0.2, the default value for α is 0.05, and the default value
for nboot (B) is 599. So far, simulations suggest that in terms of probability coverage, there is little or no advantage to using B > 599 when α = 0.05. However, there is no recommended choice for B when α < 0.05 simply because little is known about how the bootstrap-t performs for this special case. Finally, the default value for side is FALSE, indicating that the equal-tailed two-sided confidence interval is to be used. Using side=TRUE results in the symmetric two-sided confidence interval.
Try:
yuenbt(DataExample$PT500, DataExample3$ClassPT500, tr = 0.2, nboot = 599, side = TRUE)
My imported data set consists of predetermined ranges and their probability density values. I have plotted this in a bar chart in R. So my plot shows a histogram, but to R its just a bar plot. However, I now need to put a curve on this bar chart for visualization purposes, using same data in bar chart.
The code I have used so far is creating a funny looking curve that doesn't fit appropriately to the bar chart...Any help would be hugely appreciated please!
Code used so far:
barplot(Data10$pdf, names = Data10$ï..Weight.Range, xlab = "Weight", ylab = "Probability Density", ylim = c(0.00,0.05), main = "Histogram")
fit1<-smooth.spline(Data10$ï..Weight.Range, Data10$pdf, df=12, spar = 0.2)
lines(fit1,col="blue", lwd=3)
Link to output of this code:
Data:
Data10 <- structure(list(
ï..Weight.Range = c(0, 0.5, 1, 1.5, 2, 2.5, 3,
3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5,
11, 11.5, 12, 12.5, 13, 13.5, 14, 14.5, 15, 15.5, 16, 16.5, 17,
17.5, 18, 18.5, 19, 19.5, 20, 20.5, 21, 21.5, 22, 22.5, 23, 23.5,
24, 24.5, 25, 25.5, 26, 26.5, 27, 27.5, 28, 28.5, 29, 29.5, 30,
30.5, 31, 31.5, 32, 32.5, 33, 33.5, 34, 34.5, 35, 35.5, 36, 36.5,
37, 37.5, 38, 38.5, 39, 39.5, 40, 40.5, 41, 41.5, 42, 42.5, 43,
43.5, 44, 44.5, 45, 45.5, 46, 46.5, 47, 47.5, 48), pdf = c(0.012697609,
0.015237131, 0.017776653, 0.019046414, 0.020694512, 0.022575831,
0.024457151, 0.02633847, 0.028219789, 0.030101109, 0.031982428,
0.033863747, 0.035745066, 0.037626386, 0.039507705, 0.041389024,
0.043270343, 0.045151663, 0.042420729, 0.03688759, 0.033198831,
0.029510072, 0.026374627, 0.023976934, 0.02264407, 0.021614794,
0.020585518, 0.019556242, 0.018526967, 0.017497691, 0.016468415,
0.015439139, 0.014409863, 0.013380587, 0.012351311, 0.011322035,
0.009839476, 0.008433837, 0.007731017, 0.007028197, 0.005622558,
0.004919738, 0.004568328, 0.004498046, 0.004427764, 0.004357482,
0.0042872, 0.004216918, 0.004146636, 0.004076354, 0.004006072,
0.00393579, 0.003865508, 0.003795226, 0.003724944, 0.003654663,
0.003584381, 0.003514099, 0.003443817, 0.003373535, 0.003303253,
0.003232971, 0.003162689, 0.003092407, 0.003022125, 0.002951843,
0.002881561, 0.002811279, 0.002740997, 0.002670715, 0.002600433,
0.002530151, 0.002459869, 0.002389587, 0.002319305, 0.002249023,
0.002178741, 0.002108459, 0.002038177, 0.001967895, 0.001897613,
0.001827331, 0.001757049, 0.001686767, 0.001616485, 0.001546203,
0.001475921, 0.001405639, 0.001335357, 0.001265075, 0.001194794,
0.001124512, 0.00105423, 0.000983948, 0.000913666, 0.000843384,
0.000773102)
), class = "data.frame", row.names = c(NA, -97L))
You need to feed in the initial barplot when drawing the new lines.
my_bar <- barplot(Data10$pdf, names = Data10$ï..Weight.Range, xlab = "Weight", ylab = "Probability Density", ylim = c(0.00,0.05), main = "Histogram")
fit1<-smooth.spline(Data10$ï..Weight.Range, Data10$pdf, df=12, spar = .2)
lines(my_bar, fit1$y,col="blue",type="l",lwd=3)
The barplot function is meant to be used with a categorical variable. It is treating your x values as categories rather than a continuous number. When barplot runs, it calculates an value for each category which it silently returns. You can use those returned values with the result from your smooth spline to draw the line. For example
xx <- barplot(Data10$pdf, names = Data10$ï..Weight.Range, xlab = "Weight", ylab = "Probability Density", ylim = c(0.00,0.05), main = "Histogram")
fit1<-smooth.spline(Data10$ï..Weight.Range, Data10$pdf, df=12, spar = 0.2)
lines(xx[,1], fit1$y,col="blue", lwd=3)
I'm using the add_trace() function in a for loop to create lines for a 3d network graph in plotly's scatter3d mode. Each add_trace draws an individual line between two nodes in the network. The method is working, but with large number of loops, the speed of the individual loops seems to be slowing down very quickly.
Example data can be downloaded here: https://gist.github.com/pravj/9168fe52823c1702a07b
library(igraph)
library(plotly)
G <- read.graph("karate.gml", format = c("gml"))
L <- layout.circle(G)
vs <- V(G)
es <- as.data.frame(get.edgelist(G))
Nv <- length(vs)
Ne <- length(es[1]$V1)
Xn <- L[,1]
Yn <- L[,2]
network <- plot_ly(type = "scatter3d", x = Xn, y = Yn, z = rep(0, Ne), mode = "markers", text = vs$label, hoverinfo = "text", showlegend = F)
for(i in 1:Ne) {
v0 <- es[i,]$V1
v1 <- es[i,]$V2
x0 <- Xn[v0]
y0 <- Yn[v0]
x1 <- Xn[v1]
y1 <- Yn[v1]
df <- data.frame(x = c(x0, x1), y = c(y0, y1), z = c(0, 0))
network <- add_trace(network, data = df, x = x, y = y, z = z, type = "scatter3d", mode = "lines", showlegend = F,
marker = list(color = '#030303'), line = list(width = 0.5))
}
This example is fairly quick, but when I include a few hundred edges or more, the execution of the individual loops start to slow down radically. I tried different optimization methods (vectorisation etc), but there seems to be no working around the slowness of the add_trace function itself.
Any suggestions?
The most efficient way to add many line segments in plotly is not as a separate trace each, but to use only a single trace that contains all the line segments. You can do this by constructing a data frame with the x,y coordinates of each node to be connected, interspersed with NA's between each line segment. Then use connectgaps=FALSE to break the trace into separate segments at each NA. You can see another example of this approach, applied to spaghetti plots in this answer.
es$breaks <- NA
lines <- data.frame(node=as.vector(t(es)), x=NA, y=NA, z=0)
lines[which(!is.na(lines$node)),]$x <- Xn[lines[which(!is.na(lines$node)),]$node]
lines[which(!is.na(lines$node)),]$y <- Yn[lines[which(!is.na(lines$node)),]$node]
network <- plot_ly(type = "scatter3d", x = Xn, y = Yn, z = rep(0, Ne),
mode = "markers", text = vs$label, hoverinfo = "text",
showlegend = F) %>%
add_trace(data=lines, x=x, y=y, z=z, showlegend = FALSE,
type = 'scatter3d', mode = 'lines+markers',
marker = list(color = '#030303'), line = list(width = 0.5),
connectgaps=FALSE)
Reproducible data for this question
For convenience, here are the data for this question. The OP required downloading a .gml file from github, and installing library(igraph) to process the data into these.
es <- structure(list(
V1 = c(1, 1, 2, 1, 2, 3, 1, 1, 1, 5, 6, 1, 2, 3, 4, 1, 3, 3, 1, 5, 6, 1, 1, 4, 1, 2, 3, 4, 6, 7, 1, 2, 1, 2,
1, 2, 24, 25, 3, 24, 25, 3, 24, 27, 2, 9, 1, 25, 26, 29, 3, 9, 15, 16, 19, 21, 23, 24, 30, 31, 32, 9, 10, 14, 15, 16, 19, 20,
21, 23, 24, 27, 28, 29, 30, 31, 32, 33),
V2 = c(2, 3, 3, 4, 4, 4, 5, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 11, 11, 11, 12, 13, 13,
14, 14, 14, 14, 17, 17, 18, 18, 20, 20, 22, 22, 26, 26, 28, 28, 28, 29, 30, 30, 31, 31, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34)),
.Names = c("V1", "V2"), row.names = c(NA, -78L), class = "data.frame")
theta <- seq(0,2,length.out=35)[1:34]
Xn <- cospi(theta)
Yn <- sinpi(theta)
Nv <- NROW(Xn)
Ne <- NROW(es)
vs <- data.frame(label = as.character(1:Nv))