Related
desired_output_sample
I have following data:
#1. dates of 15 day frequency:
dates = seq(as.Date("2016-09-01"), as.Date("2020-07-30"), by=15) #96 times observation
#2. water content in crops corresponding to the times given.
water <- c(0.5702722, 0.5631781, 0.5560839, 0.5555985, 0.5519783, 0.5463459,
0.5511598, 0.546652, 0.5361545, 0.530012, 0.5360571, 0.5396569,
0.5683526, 0.6031535, 0.6417821, 0.671358, 0.7015542, 0.7177007,
0.7103561, 0.7036985, 0.6958607, 0.6775161, 0.6545367, 0.6380155,
0.6113306, 0.5846186, 0.5561815, 0.5251135, 0.5085149, 0.495352,
0.485819, 0.4730029, 0.4686458, 0.4616468, 0.4613918, 0.4615532,
0.4827496, 0.5149105, 0.5447824, 0.5776764, 0.6090217, 0.6297454,
0.6399422, 0.6428941, 0.6586344, 0.6507473, 0.6290631, 0.6011123,
0.5744375, 0.5313527, 0.5008027, 0.4770338, 0.4564025, 0.4464508,
0.4309046, 0.4351668, 0.4490393, 0.4701232, 0.4911582, 0.5162941,
0.5490387, 0.5737573, 0.6031149, 0.6400073, 0.6770058, 0.7048311,
0.7255012, 0.739107, 0.7338938, 0.7265202, 0.6940718, 0.6757214,
0.6460862, 0.6163091, 0.5743775, 0.5450822, 0.5057753, 0.4715266,
0.4469859, 0.4303232, 0.4187793, 0.4119401, 0.4201316, 0.426369,
0.4419331, 0.4757525, 0.5070846, 0.5248457, 0.5607567, 0.5859825,
0.6107531, 0.6201754, 0.6356589, 0.6336177, 0.6275579, 0.6214981)
I want to compute trend of the water content or moisture data corresponding to different subperiods. Lets say: one trend from 2016 - 09-01 to 2019-11-30.
and other trend from 2019-12-15 to the last date (in this case 2020-07-27).
And I want to make a plot like the one attached.
Appreciate your help. Can be in R or in python.
To draw a trend line, you can look on this tutorial
https://www.statology.org/ggplot-trendline/
Or on this stackoverflow question
Draw a trend line using ggplot
To split your dataset in two groups you simply need to do something like this (in R).
data <- data.frame(dates, water)
#This neat trick allows you to turn a logical value into a number
data$group <- 1 + (data$dates > "2019-11-30")
old <- subset(data,group == 1)
new <- subset(data,group == 2)
For the plots:
library(ggplot2)
ggplot(old,aes(x = dates, y = water)) +
geom_smooth(method = "lm", col = "blue") +
geom_point()
ggplot(new,aes(x = dates, y = water)) +
geom_smooth(method = "lm", col = "red") +
geom_point()
Here is a full-fledged example with added labels:
library(dplyr)
library(ggplot2)
dates <- seq(as.Date("2016-09-01"), as.Date("2020-07-30"), by=15)
wc <- as.numeric(strsplit("0.5702722 0.5631781 0.5560839 0.5555985 0.5519783 0.5463459 0.5511598 0.5466520 0.5361545 0.5300120 0.5360571 0.5396569 0.5683526 0.6031535 0.6417821 0.6713580 0.7015542 0.7177007 0.7103561 0.7036985 0.6958607 0.6775161 0.6545367 0.6380155 0.6113306 0.5846186 0.5561815 0.5251135 0.5085149 0.4953520 0.4858190 0.4730029 0.4686458 0.4616468 0.4613918 0.4615532 0.4827496 0.5149105 0.5447824 0.5776764 0.6090217 0.6297454 0.6399422 0.6428941 0.6586344 0.6507473 0.6290631 0.6011123 0.5744375 0.5313527 0.5008027 0.4770338 0.4564025 0.4464508 0.4309046 0.4351668 0.4490393 0.4701232 0.4911582 0.5162941 0.5490387 0.5737573 0.6031149 0.6400073 0.6770058 0.7048311 0.7255012 0.7391070 0.7338938 0.7265202 0.6940718 0.6757214 0.6460862 0.6163091 0.5743775 0.5450822 0.5057753 0.4715266 0.4469859 0.4303232 0.4187793 0.4119401 0.4201316 0.4263690 0.4419331 0.4757525 0.5070846 0.5248457 0.5607567 0.5859825 0.6107531 0.6201754 0.6356589 0.6336177 0.6275579 0.6214981", " |\\n")[[1]])
data <- data.frame(date=dates, water_content=wc) %>%
mutate(group = ifelse(date <= as.Date("2019-11-30"), "g1", "g2"))
# calculate linear regression and create labels
lmo <- data %>%
group_by(group) %>%
summarise(res=list(stats::lm(water_content ~ date, data = cur_data()))) %>%
.$res
lab <- sapply(lmo, \(x)
paste("Slope=", signif(x$coef[[2]], 5),
"\nAdj R2=", signif(summary(x)$adj.r.squared, 5),
"\nP=", signif(summary(x)$coef[2,4], 5)))
ggplot(data=data, aes(x=date, y=water_content, col=group)) +
geom_point() +
stat_smooth(geom="smooth", method="lm") +
geom_text(aes(date, y, label=lab),
data=data.frame(data %>% group_by(group) %>%
summarise(date=first(date)), y=Inf, lab=lab),
vjust=1, hjust=.2)
Created on 2022-11-23 with reprex v2.0.2
Here is a way. Create a grouping variable by dates, coerce it to factor and geom_smooth will draw the two regression lines.
suppressPackageStartupMessages({
library(ggplot2)
library(ggpubr)
})
df1 <- data.frame(dates, water)
breakpoint <- as.Date("2019-11-30")
df1$group <- factor(df1$dates > breakpoint, labels = c("before", "after"))
ggplot(df1, aes(dates, water, colour = group)) +
geom_line() +
geom_point(shape = 21, fill = 'white') +
geom_smooth(formula = y ~ x, method = lm) +
geom_vline(xintercept = breakpoint, linetype = "dotdash", linewidth = 1) +
stat_cor(label.y = c(0.43, 0.38), show.legend = FALSE) +
stat_regline_equation(label.y = c(0.45, 0.4), show.legend = FALSE) +
scale_color_manual(values = c(before = 'red', after = 'blue')) +
theme_bw(base_size = 15)
Created on 2022-11-23 with reprex v2.0.2
I would like to group a series of lines by 2 factors using group = interaction in ggplot. Here is some sample code:
set.seed(123)
N <- 18
means <- rnorm(N,0,1)
ses <- rexp(N,2)
upper<- means+qnorm(0.975)*ses
lower<- means+qnorm(0.025)*ses
fruit <- rep(c("Apples","Bananas","Pears"), each=6)
size <- rep(rep(c("Small","Medium","Big"), each=2),3)
GMO <- rep(c("Yes","No"), 9)
d<- data.frame(means,upper,lower,fruit,size,GMO)
ggplot(data=d,
aes(x = fruit,y = means, ymin = lower, ymax = upper, col=size,linetype=GMO,group=interaction(GMO, size)))+
geom_hline(aes(fill=size),yintercept =1, linetype=2)+
xlab('labels')+ ylab("Parameter estimates (95% Confidence Interval)")+
geom_pointrange(position=position_dodge(width = 0.6)) +
scale_x_discrete(name="Fruits")+
coord_flip()-> fplot
dev.new()
fplot
Here's a link to the resulting graph: https://i.stack.imgur.com/5YF4F.png
I would like to bring the same coloured lines for each of the three groups closer together. In other words I would like the lines to cluster not only by the 'Fruit' variable but also the 'Size' variable for each of the fruits. poisition_dodge seems to only work for one of the interacting groups.
Thanks for your advice.
As far as I know that is not possible with position_dodge, i.e. it dodges according to the categories of the group aes. And it does not matter whether you map one variable on the group aes or an interaction of two or more. The groups are simply placed equidistant from one another.
One option to achieve your desired result would be to use the "facets that don't look like facets" trick which means faceting by fruit, mapping size on x and afterwards using theme options to get rid of the facet look plus some tweaking of the x scale:
set.seed(123)
N <- 18
means <- rnorm(N, 0, 1)
ses <- rexp(N, 2)
upper <- means + qnorm(0.975) * ses
lower <- means + qnorm(0.025) * ses
fruit <- rep(c("Apples", "Bananas", "Pears"), each = 6)
size <- rep(rep(c("Small", "Medium", "Big"), each = 2), 3)
GMO <- rep(c("Yes", "No"), 9)
d <- data.frame(means, upper, lower, fruit, size, GMO)
library(ggplot2)
ggplot(data = d, aes(x = size, y = means, ymin = lower, ymax = upper, col = size, linetype = GMO, group = GMO)) +
geom_hline(yintercept = 1, linetype = 2) +
xlab("labels") +
ylab("Parameter estimates (95% Confidence Interval)") +
geom_pointrange(position = position_dodge(width = 0.6)) +
scale_x_discrete(name = "Fruits", breaks = "Medium", labels = NULL, expand = c(0, 1)) +
coord_flip() +
facet_grid(fruit ~ ., switch = "y") +
theme(strip.placement = "outside",
strip.background.y = element_blank(),
strip.text.y.left = element_text(angle = 0),
panel.spacing.y = unit(0, "pt"))
Maybe you want to facet_wrap your size variable:
set.seed(123)
N <- 18
means <- rnorm(N,0,1)
ses <- rexp(N,2)
upper<- means+qnorm(0.975)*ses
lower<- means+qnorm(0.025)*ses
fruit <- rep(c("Apples","Bananas","Pears"), each=6)
size <- rep(rep(c("Small","Medium","Big"), each=2),3)
GMO <- rep(c("Yes","No"), 9)
d<- data.frame(means,upper,lower,fruit,size,GMO)
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.1.2
ggplot(data=d,
aes(x = fruit,y = means, ymin = lower, ymax = upper, col=size,linetype=GMO,group=interaction(GMO, size)))+
geom_hline(aes(fill=size),yintercept =1, linetype=2)+
xlab('labels')+ ylab("Parameter estimates (95% Confidence Interval)")+
geom_pointrange(position=position_dodge(width = 0.6)) +
scale_x_discrete(name="Fruits")+
coord_flip() +
facet_wrap(~size)-> fplot
#> Warning: geom_hline(): Ignoring `mapping` because `yintercept` was provided.
fplot
Created on 2022-07-13 by the reprex package (v2.0.1)
I have recently came across a problem with ggplot2::geom_density that I am not able to solve. I am trying to visualise a density of some variable and compare it to a constant. To plot the density, I am using the ggplot2::geom_density. The variable for which I am plotting the density, however, happens to be a constant (this time):
df <- data.frame(matrix(1,ncol = 1, nrow = 100))
colnames(df) <- "dummy"
dfV <- data.frame(matrix(5,ncol = 1, nrow = 1))
colnames(dfV) <- "latent"
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.2, position = "identity") +
geom_vline(data = dfV, aes(xintercept = latent, color = 'ls'), size = 2)
This is OK and something I would expect. But, when I shift this distribution to the far right, I get a plot like this:
df <- data.frame(matrix(71,ncol = 1, nrow = 100))
colnames(df) <- "dummy"
dfV <- data.frame(matrix(75,ncol = 1, nrow = 1))
colnames(dfV) <- "latent"
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.2, position = "identity") +
geom_vline(data = dfV, aes(xintercept = latent, color = 'ls'), size = 2)
which probably means that the kernel estimation is still taking 0 as the centre of the distribution (right?).
Is there any way to circumvent this? I would like to see a plot like the one above, only the centre of the kerner density would be in 71 and the vline in 75.
Thanks
Well I am not sure what the code does, but I suspect the geom_density primitive was not designed for a case where the values are all the same, and it is making some assumptions about the distribution that are not what you expect. Here is some code and a plot that sheds some light:
# Generate 10 data sets with 100 constant values from 0 to 90
# and then merge them into a single dataframe
dfs <- list()
for (i in 1:10){
v <- 10*(i-1)
dfs[[i]] <- data.frame(dummy=rep(v,100),facet=v)
}
df <- do.call(rbind,dfs)
# facet plot them
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.5, position = "identity") +
facet_wrap( ~ facet,ncol=5 )
Yielding:
So it is not doing what you thought it was, but it is also probably not doing what you want. You could of course make it "translation-invariant" (almost) by adding some noise like this for example:
set.seed(1234)
noise <- +rnorm(100,0,1e-3)
dfs <- list()
for (i in 1:10){
v <- 10*(i-1)
dfs[[i]] <- data.frame(dummy=rep(v,100)+noise,facet=v)
}
df <- do.call(rbind,dfs)
ggplot() +
geom_density(data = df, aes(x = dummy, colour = 's'),
fill = '#FF6666', alpha = 0.5, position = "identity") +
facet_wrap( ~ facet,ncol=5 )
Yielding:
Note that there is apparently a random component to the geom_density function, and I can't see how to set the seed before each instance, so the estimated density is a bit different each time.
this is my first stack overflow post and I am a relatively new R user, so please go gently!
I have a data frame with three columns, a participant identifier, a condition (factor with 2 levels either Placebo or Experimental), and an outcome score.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
I would like to construct a bar plot with two bars with the mean outcome score for each condition and the standard deviation as an error bar. I would like to then overlay lines connecting points for each participant's score in each condition. So the plot displays the individual response as well as the group mean.If it is also possible I would like to include an axis break.
I don't seem to be able to find any advice in other threads, apologies if I am repeating a question.
Many Thanks.
p.s. I realise that presenting data in this way will not be to everyones tastes. It is for a specific requirement!
This ought to work:
library(ggplot2)
library(dplyr)
dat.summ <- dat %>% group_by(Condition) %>%
summarize(mean.outcome = mean(Outcome),
sd.outcome = sd(Outcome))
ggplot(dat.summ, aes(x = Condition, y = mean.outcome)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = mean.outcome - sd.outcome,
ymax = mean.outcome + sd.outcome),
color = "dodgerblue", width = 0.3) +
geom_point(data = dat, aes(x = Condition, y = Outcome),
color = "firebrick", size = 1.2) +
geom_line(data = dat, aes(x = Condition, y = Outcome, group = ID),
color = "firebrick", size = 1.2, alpha = 0.5) +
scale_y_continuous(limits = c(0, max(dat$Outcome)))
Some people are better with ggplot's stat functions and arguments than I am and might do it differently. I prefer to just transform my data first.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
dat.w <- reshape(dat, direction = 'wide', idvar = 'ID', timevar = 'Condition')
means <- colMeans(dat.w[, 2:3])
sds <- apply(dat.w[, 2:3], 2, sd)
ci.l <- means - sds
ci.u <- means + sds
ci.width <- .25
bp <- barplot(means, ylim = c(0,20))
segments(bp, ci.l, bp, ci.u)
segments(bp - ci.width, ci.u, bp + ci.width, ci.u)
segments(bp - ci.width, ci.l, bp + ci.width, ci.l)
segments(x0 = bp[1], x1 = bp[2], y0 = dat.w[, 2], y1 = dat.w[, 3], col = 1:10)
points(c(rep(bp[1], 10), rep(bp[2], 10)), dat$Outcome, col = 1:10, pch = 19)
Here is a method using the transfomations inside ggplot2
ggplot(dat) +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.y="mean", geom="bar") +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.data="mean_se", geom="errorbar", col="green", width=.8, size=2) +
geom_line(aes(x=Condition, y=Outcome, group=ID), col="red")
It's common to put stars on barplots or boxplots to show the level of significance (p-value) of one or between two groups, below are several examples:
The number of stars are defined by p-value, for example one can put 3 stars for p-value < 0.001, two stars for p-value < 0.01, and so on (although this changes from one article to the other).
And my questions: How to generate similar charts? The methods that automatically put stars based on significance level are more than welcome.
I know that this is an old question and the answer by Jens Tierling already provides one solution for the problem. But I recently created a ggplot-extension that simplifies the whole process of adding significance bars: ggsignif
Instead of tediously adding the geom_line and geom_text to your plot you just add a single layer geom_signif:
library(ggplot2)
library(ggsignif)
ggplot(iris, aes(x=Species, y=Sepal.Length)) +
geom_boxplot() +
geom_signif(comparisons = list(c("versicolor", "virginica")),
map_signif_level=TRUE)
To create a more advanced plot similar to the one shown by Jens Tierling, you can do:
dat <- data.frame(Group = c("S1", "S1", "S2", "S2"),
Sub = c("A", "B", "A", "B"),
Value = c(3,5,7,8))
ggplot(dat, aes(Group, Value)) +
geom_bar(aes(fill = Sub), stat="identity", position="dodge", width=.5) +
geom_signif(stat="identity",
data=data.frame(x=c(0.875, 1.875), xend=c(1.125, 2.125),
y=c(5.8, 8.5), annotation=c("**", "NS")),
aes(x=x,xend=xend, y=y, yend=y, annotation=annotation)) +
geom_signif(comparisons=list(c("S1", "S2")), annotations="***",
y_position = 9.3, tip_length = 0, vjust=0.4) +
scale_fill_manual(values = c("grey80", "grey20"))
Full documentation of the package is available at CRAN.
Please find my attempt below.
First, I created some dummy data and a barplot which can be modified as we wish.
windows(4,4)
dat <- data.frame(Group = c("S1", "S1", "S2", "S2"),
Sub = c("A", "B", "A", "B"),
Value = c(3,5,7,8))
## Define base plot
p <-
ggplot(dat, aes(Group, Value)) +
theme_bw() + theme(panel.grid = element_blank()) +
coord_cartesian(ylim = c(0, 15)) +
scale_fill_manual(values = c("grey80", "grey20")) +
geom_bar(aes(fill = Sub), stat="identity", position="dodge", width=.5)
Adding asterisks above a column is easy, as baptiste already mentioned. Just create a data.frame with the coordinates.
label.df <- data.frame(Group = c("S1", "S2"),
Value = c(6, 9))
p + geom_text(data = label.df, label = "***")
To add the arcs that indicate a subgroup comparison, I computed parametric coordinates of a half circle and added them connected with geom_line. Asterisks need new coordinates, too.
label.df <- data.frame(Group = c(1,1,1, 2,2,2),
Value = c(6.5,6.8,7.1, 9.5,9.8,10.1))
# Define arc coordinates
r <- 0.15
t <- seq(0, 180, by = 1) * pi / 180
x <- r * cos(t)
y <- r*5 * sin(t)
arc.df <- data.frame(Group = x, Value = y)
p2 <-
p + geom_text(data = label.df, label = "*") +
geom_line(data = arc.df, aes(Group+1, Value+5.5), lty = 2) +
geom_line(data = arc.df, aes(Group+2, Value+8.5), lty = 2)
Lastly, to indicate comparison between groups, I built a larger circle and flattened it at the top.
r <- .5
x <- r * cos(t)
y <- r*4 * sin(t)
y[20:162] <- y[20] # Flattens the arc
arc.df <- data.frame(Group = x, Value = y)
p2 + geom_line(data = arc.df, aes(Group+1.5, Value+11), lty = 2) +
geom_text(x = 1.5, y = 12, label = "***")
There is also an extension of the ggsignif package called ggpubr that is more powerful when it comes to multi-group comparisons. It builds on top of ggsignif, but also handles anova and kruskal-wallis as well as pairwise comparisons against the gobal mean.
Example:
library(ggpubr)
my_comparisons = list( c("0.5", "1"), c("1", "2"), c("0.5", "2") )
ggboxplot(ToothGrowth, x = "dose", y = "len",
color = "dose", palette = "jco")+
stat_compare_means(comparisons = my_comparisons, label.y = c(29, 35, 40))+
stat_compare_means(label.y = 45)
I found this one is useful.
library(ggplot2)
library(ggpval)
data("PlantGrowth")
plt <- ggplot(PlantGrowth, aes(group, weight)) +
geom_boxplot()
add_pval(plt, pairs = list(c(1, 3)), test='wilcox.test')
Made my own function:
ts_test <- function(dataL,x,y,method="t.test",idCol=NULL,paired=F,label = "p.signif",p.adjust.method="none",alternative = c("two.sided", "less", "greater"),...) {
options(scipen = 999)
annoList <- list()
setDT(dataL)
if(paired) {
allSubs <- dataL[,.SD,.SDcols=idCol] %>% na.omit %>% unique
dataL <- dataL[,merge(.SD,allSubs,by=idCol,all=T),by=x] #idCol!!!
}
if(method =="t.test") {
dataA <- eval(parse(text=paste0(
"dataL[,.(",as.name(y),"=mean(get(y),na.rm=T),sd=sd(get(y),na.rm=T)),by=x] %>% setDF"
)))
res<-pairwise.t.test(x=dataL[[y]], g=dataL[[x]], p.adjust.method = p.adjust.method,
pool.sd = !paired, paired = paired,
alternative = alternative, ...)
}
if(method =="wilcox.test") {
dataA <- eval(parse(text=paste0(
"dataL[,.(",as.name(y),"=median(get(y),na.rm=T),sd=IQR(get(y),na.rm=T,type=6)),by=x] %>% setDF"
)))
res<-pairwise.wilcox.test(x=dataL[[y]], g=dataL[[x]], p.adjust.method = p.adjust.method,
paired = paired, ...)
}
#Output the groups
res$p.value %>% dimnames %>% {paste(.[[2]],.[[1]],sep="_")} %>% cat("Groups ",.)
#Make annotations ready
annoList[["label"]] <- res$p.value %>% diag %>% round(5)
if(!is.null(label)) {
if(label == "p.signif"){
annoList[["label"]] %<>% cut(.,breaks = c(-0.1, 0.0001, 0.001, 0.01, 0.05, 1),
labels = c("****", "***", "**", "*", "ns")) %>% as.character
}
}
annoList[["x"]] <- dataA[[x]] %>% {diff(.)/2 + .[-length(.)]}
annoList[["y"]] <- {dataA[[y]] + dataA[["sd"]]} %>% {pmax(lag(.), .)} %>% na.omit
#Make plot
coli="#0099ff";sizei=1.3
p <-ggplot(dataA, aes(x=get(x), y=get(y))) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd),width=.1,color=coli,size=sizei) +
geom_line(color=coli,size=sizei) + geom_point(color=coli,size=sizei) +
scale_color_brewer(palette="Paired") + theme_minimal() +
xlab(x) + ylab(y) + ggtitle("title","subtitle")
#Annotate significances
p <-p + annotate("text", x = annoList[["x"]], y = annoList[["y"]], label = annoList[["label"]])
return(p)
}
Data and call:
library(ggplot2);library(data.table);library(magrittr);
df_long <- rbind(ToothGrowth[,-2],data.frame(len=40:50,dose=3.0))
df_long$ID <- data.table::rowid(df_long$dose)
ts_test(dataL=df_long,x="dose",y="len",idCol="ID",method="wilcox.test",paired=T)
Result: