Related
I'm trying to check the correlation of a bunch of variables and wanted to create a graph(s) of all the dependent variables on my response.
Price <- c(10,11,22,15,15)
Var1 <- c(2,3,12,5,17)
Var2 <- c(3,3,12,16,7)
Var3 <- c(2,5,2,5,18)
data <- data.frame(Var1,Var2,Var3,Price)
I was thinking something like this would work ;
variables <- c('Var1', 'Var2', 'Var3')
for (i in variables){
plot <- ggplot(data=data, aes(x = i, y=Price))+
geom_point(shape=16, color="dodgerblue")+
geom_smooth(method=lm, color='Black')
print(plot)
}
But it only prints out the response for variable 3 without any values of x.
As i in your loop is a character, you need to call it with get(i) in your ggplot:
for (i in variables){
plot <- ggplot(data=data, aes(x = get(i), y=Price))+
geom_point(shape=16, color="dodgerblue")+
geom_smooth(method=lm, color='Black')
print(plot)
}
will work.
Two alternatives to have the 3 graphs together:
alternative 1
Long format, and facet_wrap:
library(tidyr)
pivot_longer(data,paste0("Var",1:3)) %>%
ggplot(aes(value,Price))+
geom_point(shape=16, color="dodgerblue")+
geom_smooth(method=lm, color='Black')+
facet_wrap(~name)
second alternative
You could try to use the wonderful {patchwork} package also:
plot_list <- lapply(variables,function(i){
ggplot(data=data, aes(x = get(i), y=Price))+
geom_point(shape=16, color="dodgerblue")+
geom_smooth(method=lm, color='Black')+
labs(x = i)
})
library(patchwork)
wrap_plots(plot_list)
par works in low-level plotting.
par(mfrow=c(1, 3))
with(data, lapply(names(data)[1:3], \(x) {
plot(data[c(x, 'Price')]); abline(lm(Price ~ get(x)))
}))
Data:
data <- structure(list(Var1 = c(2, 3, 12, 5, 17), Var2 = c(3, 3, 12,
16, 7), Var3 = c(2, 5, 2, 5, 18), Price = c(10, 11, 22, 15, 15
)), class = "data.frame", row.names = c(NA, -5L))
New to posting on here. Apologies if I miss including something needed to solve my situation.
I have a matched case-control design where three 'younger' clinical cases have been age-matched to a 'younger' control group, and three 'older' cases have been matched to an 'older' control group. I am plotting the control group distribution in a violin plot and overlaying the corresponding matched cases as data points.
I have a lot of variables and I would like to loop through them to minimise error and increase efficiency. I have had a go at writing the code for the loop but I am not sure what to do with the fact that I have two types of plots (violin and point) and two data frames (controls and cases) involved.
Here is the code I have for the plots:
#fake data
cases <- data.frame(
id = factor(1:6),
strange_stories_ToM_mean = sample(6:8, 6, replace = TRUE),
age = factor(c(rep("young", 3), rep("old", 3)))
)
controls <- data.frame(
id = 7:23,
strange_stories_ToM_mean = sample(c(6,6,7,7,7,7,7,7,7,8,8,8,9,9,9,9,9), 17),
age = c(rep("young", 9), rep("old", 8))
)
#plots
ggplot(data = controls, aes(strange_stories_ToM_mean, age)) +
geom_violin(
trim = FALSE,
alpha = 0.2,
draw_quantiles = c(0.25, 0.5, 0.75),
fill = "gray90"
) +
geom_point(
data = cases,
aes(colour = id, shape = id), # map color/shape to individual cases
size = 5,
show.legend = FALSE
) +
scale_shape_manual(values=c(16, 17, 15, 16, 17, 15)) +
scale_colour_manual(values=c("deeppink1","indianred3", "blueviolet", "springgreen3", "chartreuse2", "darkgreen")) +
scale_size_manual(values=c(5, 4, 5, 5, 4, 5)) +
theme_classic()
ggsave("strange_stories_ToM_mean.svg", width = 8, height = 8, units = "cm")
I looked at using 'for' and created a list to loop through (what I have is below) but I came unstuck at where the list should be incorporated when two data frames are being used and two plots...could lapply be best?
variables <- list() # Create empty listfor(i in ncol(FTD_data)) { # Using for-loop to add all columns tolist variables[[i]] <- FTD_data[ , i]}
names(variables) <- colnames(FTD_data) #rename list elements with variable names from df
for (i in variables)
{CODE TO PLOT INSERT HERE}
One approach to achieve your desired result would be to put your plotting code inside a function which takes one argument, the name of the column to plot. The only change I made to your plotting code is to replace the hardcoded strange_stories_ToM_mean by .data[[col]] to tell ggplot I want to plot the data column whose name is stored in col.
Also, instead of using a for loop I would recommend to use lapply when using ggplot2:
library(ggplot2)
plot_fun <- function(col) {
ggplot(data = controls, aes(.data[[col]], age)) +
geom_violin(
trim = FALSE,
alpha = 0.2,
draw_quantiles = c(0.25, 0.5, 0.75),
fill = "gray90"
) +
geom_point(
data = cases,
aes(colour = id, shape = id),
size = 5,
show.legend = FALSE
) +
scale_shape_manual(values=c(16, 17, 15, 16, 17, 15)) +
scale_colour_manual(values=c("deeppink1","indianred3", "blueviolet", "springgreen3", "chartreuse2", "darkgreen")) +
scale_size_manual(values=c(5, 4, 5, 5, 4, 5)) +
theme_classic()
}
cols_to_plot <- names(controls)[!names(controls) %in% c("id", "age")]
names(cols_to_plot) <- cols_to_plot
p <- lapply(cols_to_plot, plot_fun)
lapply(cols_to_plot, function(x) ggsave(paste0(x, ".svg"), plot = p[[x]], width = 8, height = 8, units = "cm"))
#> $strange_stories_ToM_mean
#> [1] "strange_stories_ToM_mean.svg"
#>
#> $strange_stories_ToM_median
#> [1] "strange_stories_ToM_median.svg"
p
#> $strange_stories_ToM_mean
#>
#> $strange_stories_ToM_median
DATA
set.seed(123)
cases <- data.frame(
id = factor(1:6),
strange_stories_ToM_mean = sample(6:8, 6, replace = TRUE),
strange_stories_ToM_median = sample(6:8, 6, replace = TRUE),
age = factor(c(rep("young", 3), rep("old", 3)))
)
controls <- data.frame(
id = 7:23,
strange_stories_ToM_mean = sample(c(6,6,7,7,7,7,7,7,7,8,8,8,9,9,9,9,9), 17),
strange_stories_ToM_median = sample(c(6,6,7,7,7,7,7,7,7,8,8,8,9,9,9,9,9), 17),
age = c(rep("young", 9), rep("old", 8))
)
I have xy grouped data that I'm plotting using R's ggplot2 geom_violin adding regression trend lines:
Here are the data:
library(dplyr)
library(plotly)
library(ggplot2)
set.seed(1)
df <- data.frame(value = c(rnorm(500,8,1),rnorm(600,6,1.5),rnorm(400,4,0.5),rnorm(500,2,2),rnorm(400,4,1),rnorm(600,7,0.5),rnorm(500,3,1),rnorm(500,3,1),rnorm(500,3,1)),
age = c(rep("d3",500),rep("d8",600),rep("d24",400),rep("d3",500),rep("d8",400),rep("d24",600),rep("d3",500),rep("d8",500),rep("d24",500)),
group = c(rep("A",1500),rep("B",1500),rep("C",1500))) %>%
dplyr::mutate(time = as.integer(age)) %>%
dplyr::arrange(group,time) %>%
dplyr::mutate(group_age=paste0(group,"_",age))
df$group_age <- factor(df$group_age,levels=unique(df$group_age))
And my current plot:
ggplot(df,aes(x=group_age,y=value,fill=age,color=age,alpha=0.5)) +
geom_violin() + geom_boxplot(width=0.1,aes(fill=age,color=age,middle=mean(value))) +
geom_smooth(data=df,mapping=aes(x=group_age,y=value,group=group),color="black",method='lm',size=1,se=T) + theme_minimal()
My questions are:
How do I get rid of the alpha part of the legend?
I would like the x-axis ticks to be df$group rather than df$group_age, which means a tick per each group at the center of that group where the label is group. Consider a situation where not all groups have all ages - for example, if a certain group has only two of the ages and I'm pretty sure ggplot will only present only these two ages, I'd like the tick to still be centered between their two ages.
One more question:
It would also be nice to have the p-values of each fitted slope plotted on top of each group.
I tried:
library(ggpmisc)
my.formula <- value ~ group_age
ggplot(df,aes(x=group_age,y=value,fill=age,color=age,alpha=0.5)) +
geom_violin() + geom_boxplot(width=0.1,aes(fill=age,color=age,middle=mean(value))) +
geom_smooth(data=df,mapping=aes(x=group_age,y=value,group=group),color="black",method='lm',size=1,se=T) + theme_minimal() +
stat_poly_eq(formula = my.formula,aes(label=stat(p.value.label)),parse=T)
But I get the same plot as above with the following warning message:
Warning message:
Computation failed in `stat_poly_eq()`:
argument "x" is missing, with no default
geom_smooth() fits a line, while stat_poly_eqn() issues an error. A factor is a categorical variable with unordered levels. A trend against a factor is undefined. geom_smooth() may be taking the levels and converting them to "arbitrary" numerical values, but these values are just indexes rather than meaningful values.
To obtain a plot similar to what is described in the question but using code that provides correct linear regression lines and the corresponding p-values I would use the code below. The main change is that the numerical variable time is mapped to x making the fitting of a regression a valid operation. To allow for a linear fit an x-scale with a log10 transformation is used, with breaks and labels at the ages for which data is available.
library(dplyr)
library(ggplot2)
library(ggpmisc)
set.seed(1)
df <-
data.frame(
value = c(
rnorm(500, 8, 1), rnorm(600, 6, 1.5), rnorm(400, 4, 0.5),
rnorm(500, 2, 2), rnorm(400, 4, 1), rnorm(600, 7, 0.5),
rnorm(500, 3, 1), rnorm(500, 3, 1), rnorm(500, 3, 1)
),
age = c(
rep("d3", 500), rep("d8", 600), rep("d24", 400),
rep("d3", 500), rep("d8", 400), rep("d24", 600),
rep("d3", 500), rep("d8", 500), rep("d24", 500)
),
group = c(rep("A", 1500), rep("B", 1500), rep("C", 1500))
) %>%
mutate(time = as.integer(gsub("d", "", age))) %>%
arrange(group, time) %>%
mutate(age = factor(age, levels = c("d3", "d8", "d24")),
group = factor(group))
my_formula = y ~ x
ggplot(df, aes(x = time, y = value)) +
geom_violin(aes(fill = age, color = age), alpha = 0.3) +
geom_boxplot(width = 0.1,
aes(color = age), fill = NA) +
geom_smooth(color = "black", formula = my_formula, method = 'lm') +
stat_poly_eq(aes(label = stat(p.value.label)),
formula = my_formula, parse = TRUE,
npcx = "center", npcy = "bottom") +
scale_x_log10(name = "Age", breaks = c(3, 8, 24)) +
facet_wrap(~group) +
theme_minimal()
Which creates the following figure:
Here is a solution. The alpha - legend issue is easy. Anything you place into the aes() functioning will get placed in a legend. This feature should be used when you want a feature of the data to be used as an aestetic. Putting alpha outside of an aes will remove it from the legend.
I'm not sure the x legend is what you wanted but i did it manually so it should be easy to configure.
Regarding the p.values, i did separate linear regressions and store the p.value in three different vectors which can be called into the ggplot using the annotate. For two of the groups the p.value was <.001 so the round functioning will round it to 0. Therefore, i just added p. <.001
Good luck with this!
library(dplyr)
library(ggplot2)
set.seed(1)
df <- data.frame(value = c(rnorm(500,8,1),rnorm(600,6,1.5),rnorm(400,4,0.5),rnorm(500,2,2),rnorm(400,4,1),rnorm(600,7,0.5),rnorm(500,3,1),rnorm(500,3,1),rnorm(500,3,1)),
age = c(rep("d3",500),rep("d8",600),rep("d24",400),rep("d3",500),rep("d8",400),rep("d24",600),rep("d3",500),rep("d8",500),rep("d24",500)),
group = c(rep("A",1500),rep("B",1500),rep("C",1500))) %>%
dplyr::mutate(time = as.integer(age)) %>%
dplyr::arrange(group,time) %>%
dplyr::mutate(group_age=paste0(group,"_",age))
df$group_age <- factor(df$group_age,levels=unique(df$group_age))
mod1 <- lm(value ~ time,df\[df$group == 'A',\])
mod1 <- summary(mod1)$coefficients\[8\] %>% round(2)
mod2 <- lm(value ~ time,df\[df$group == 'B',\])
mod2 <- summary(mod2)$coefficients\[8\] %>% round(2)
mod3 <- lm(value ~ time,df\[df$group == 'C',\])
mod3 <- summary(mod3)$coefficients\[8\] %>% round(2)
ggplot(df,aes(x=group_age,y=value,fill=age,color=age)) +
geom_violin(alpha=0.5) +
geom_boxplot(width=0.1,aes(fill=age,color=age,middle=mean(value))) +
geom_smooth(mapping=aes(x=group_age,y=value,group=group),color="black",method='lm',size=1,se=T) +
scale_x_discrete(labels = c('','A','','','B','','','C','')) +
annotate('text',x = 2,y = -1,label = paste('pvalue: <.001')) +
annotate('text',x = 6,y = 10,label = paste('pvalue: <.001')) +
annotate('text',x = 8,y = -1.2,label = paste('pvalue:',mod3))+
theme_minimal()
ID <- 1:10
group <- c(1,1,1,2,2,2,3,3,3,3)
var1 <- c(6:15)
var2 <- c(7:16)
var3 <- c(6:11, NA, NA, NA, NA)
var4 <- c(4:9, NA, NA, NA, NA)
data <- data.frame(ID, group, var1, var2, var3, var4)
library(dplyr)
data %>% group_by(group) %>% boxplot(var1, var2)
The last line does not work as i wish. The idea is to get 4 boxplots in one graphic. Two for each variable. Maybe i need to use ggplot2?
You need to reorganize the data if you want to get both variables in the same plot. Here is a ggplot2 solution:
# load library
library(ggplot2)
library(tidyr)
library(ggthemes)
# reorganize data
df <- gather(data, "ID","group")
#rename columns
colnames(df) <- c("ID","group","var","value")
# plot
ggplot(data=df) +
geom_boxplot( aes(x=factor(group), y=value, fill=factor(var)), position=position_dodge(1)) +
scale_x_discrete(breaks=c(1, 2, 3), labels=c("A", "B", "C")) +
theme_minimal() +
scale_fill_grey()
Making boxplots with the same width is a whole different question (solution here), but one simple alternative would be like this:
# recode column `group` in the `data.frame`.
df <- transform(df, group = ifelse(group==1, 'A', ifelse(group==2, 'B', "C")))
# plot
ggplot(data=df) +
geom_boxplot( aes(x=factor(var), y=value, fill=factor((var))), position=position_dodge(1)) +
geom_jitter(aes(x=factor(var), y=value, color=factor((var)))) +
facet_grid(.~group, scales = "free_x") +
theme_minimal()+
scale_fill_grey() +
theme(axis.text.x=element_blank(),
axis.title.x=element_blank(),
axis.ticks=element_blank())
You might try melting the data frame (mentioned in comment by #lukeA) first and then sticking to base graphics. ggplot2 or lattice are other good options.
library(reshape2)
DF <- melt(data, id.vars = c("ID", "group"), measure.vars = c("var1", "var2"))
boxplot(value ~ group + variable, DF)
Alternate lattice code, also using DF:
bwplot(~ value | variable + group, data = DF)
Alternate ggplot2 code, also using DF:
ggplot(DF, aes(x = factor(group), y = value, fill = variable)) + geom_boxplot()
Although quite late, a found a great base-R solution here
# Create some data, e.g. from https://en.wikipedia.org/wiki/One-way_analysis_of_variance#Example
df <- as.data.frame(matrix(c(6, 8, 13, 8, 12, 9, 4, 9, 11, 5, 11, 8, 3, 6, 7, 4, 8, 12),ncol = 3, byrow = TRUE))
df <- reshape(data = df, direction = "long", idvar=1:3, varying=1:3, sep = "", timevar = "Treatment")
df$Treatment <- as.factor(df$Treatment)
rownames(df) <- NULL
par(mfrow = c(2, 1))
par(mar=c(1,4,4,2) + 0.1) # mar=c(b,l,t,r)
boxplot(V ~ Treatment, data = df, xlab = NULL, xaxt = "n",
ylab = "V", main = "One-way anova with 3 different levels of one factor")
stripchart(V ~ Treatment, # Points
data = df, # Data
method = "jitter", # Random noise
pch = 19, # Pch symbols
col = 4, # Color of the symbol
vertical = TRUE, # Vertical mode
add = TRUE) # Add it over
par(mar=c(5,4,0,2) + 0.1)
boxplot(V ~ Treatment, data = df, xlab = "Treatment",
ylab = "V", main = NULL)
stripchart(V ~ Treatment, # Points
data = df, # Data
method = "overplot", # Random noise
pch = 19, # Pch symbols
col = 4, # Color of the symbol
vertical = TRUE, # Vertical mode
add = TRUE) # Add it over
par(mfrow = c(1, 1))
Result:
For multiple (here: two) value lists I want to
plot values as line or points into one diagram
plot histograms into another diagram and
assign the same color to the respective line plot and histogram plot
I've come up with a combination of two examples using ggplot2, which is still using different colors for line plot and histograms. Also it may be a bit redundant, creating
How can I get the same color for line plot and histogram?
Bonus: How can I shorten the piece of used source code?
my result so far:
Source Code (R):
# input data lists
vals_x <- c(4, 3, 6, 7, 4, 6, 9, 3, 0, 8, 3, 7, 7, 5, 9, 0)
vals_y <- c(6, 6, 4, 8, 0, 3, 7, 3, 1, 8, 2, 1, 2, 3, 6, 5)
# ------------------------------------------------
library(ggplot2)
library(gridExtra)
# prepare data for plotting
df <- rbind( data.frame( fill = "blue", obs = vals_x),
data.frame( fill = "red", obs = vals_y))
test_data <- data.frame(
var0 = vals_x,
var1 = vals_y,
idx = seq(length(vals_x)))
stacked <- with(test_data,
data.frame(value = c(var0, var1),
variable = factor(rep(c("Values x","Values y"),
each = NROW(test_data))),
idx = rep(idx, 2),
fill_col = c( rep("blue", length(vals_x)),
rep("red", length(vals_y)))))
# plot line
p_line <- ggplot(stacked, aes(idx, value, colour = variable)) +
geom_line()
# plot histogram
p_hist <- ggplot( df, aes(x=obs, fill = fill)) +
geom_histogram(binwidth=2, colour="black", position="dodge") +
scale_fill_identity()
# arrange diagrams
grid.arrange( p_line, p_hist, ncol = 2)
The easiest thing to do is
Use the same data set in each ggplot object
Then use scale_*_manual (or some other scale call).
So
## Particularly awful colours
p_hist = ggplot(stacked, aes(x=value, fill=variable)) +
geom_histogram(binwidth=2, colour="black", position="dodge") +
scale_fill_manual(values=c("red", "yellow"))
p_line = ggplot(stacked, aes(idx, value, colour = variable)) +
geom_line() +
scale_colour_manual(values=c("red", "yellow"))
As an aside, I wouldn't use a histogram here; a boxplot or density plot would be much better.