I am investigating the correlation between sensory data and chemical measurements using PLS regression from the pls package. Ultimately, I want to display the results in a correlation loading plot as illustrated by the example below. So far I managed to make the plot with X and Y correlation matrices but I haven't figured out a way to project the observations on the plot.
As an example, I use the oliveoil data set from the pls package. I computed the correlation loadings (using the method described here) and created a correlation plot using ggplot2 (This can be done in a simple manner using the plsdepot package but I like the versatility of ggplot):
library(pls)
data("oliveoil")
oil <- plsr(sensory ~ chemical, scale = TRUE, data = oliveoil)
scores <- oil$scores
sc1 <- scores[,1]
sc2 <- scores[,2]
scores <- as.data.frame(cbind(sc1, sc2))
cl_plsr <- cor(model.matrix(oil), scores)
df_cor <- as.data.frame(cl_plsr)
df_depend_cor <- as.data.frame(cor(oliveoil$sensory, scores))
plot_loading_correlation <- rbind(df_cor, df_depend_cor)
plot_loading_correlation1 <- setNames(plot_loading_correlation, c("comp1", "comp2"))
#Function to draw circle
circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
r = diameter / 2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
dat_plsr <- circleFun(c(0,0),2,npoints = 100)
library(ggplot2)
library(ggrepel)
p <- ggplot(data=plot_loading_correlation1, aes(comp1, comp2))+
theme_bw() +
geom_hline(aes(yintercept = 0), size=.2, linetype = 3)+
geom_vline(aes(xintercept = 0), size=.2, linetype = 3)+
geom_text_repel(aes(label = rownames(plot_loading_correlation1),
colour = c("black","black","black","black","black",
"red","red","red","red","red","red")))+
scale_color_manual(values=c("blue","red"))+
scale_x_continuous(breaks = seq(-1,2.5, by=0.5))+
scale_y_continuous(breaks = seq(-1.5,2.5, by=0.5))+
coord_fixed(ylim=c(-1, 1), xlim=c(-1, 1)) + xlab("PC 1") + ylab("PC 2")+
geom_path(data=dat_plsr ,
aes(x,y), colour = "darkgrey")+
theme(legend.title=element_blank())+
theme(axis.ticks = element_line(colour = "black"))+
theme(axis.title = element_text(colour = "black"))+
theme(axis.text = element_text(color="black"))+
theme(legend.position='none')+
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major = element_blank()) +
geom_point(data = plot_loading_correlation1,
aes(x=comp1, y=comp2),
colour = c("blue","blue","blue","blue","blue",
"red","red","red","red","red","red"),
shape = c(21,21,21,21,21,22,22,22,22,22,22),
fill = c("blue","blue","blue","blue","blue",
"red","red","red","red","red","red"),
size = 2.2)
p
How can I project individual observations to that plot as illustrated in the example above? Should the scores be scaled so that they fit on the correlation loadings scale (from -1 to 1)? And is that acceptable scientifically?
Related
I am including marginal distribution plots on a scatterplot of a continuous and integer variable. However, in the integer variable maringal distribution plot (y-axis) there is this zig-zag pattern that shows up because the y-values are all integers. Is there any way to increase the "width" (not sure that's the right term) of the bins/values the function calculates the distribution density over?
The goal is to get rid of that zig-zag pattern that develops because the y-values are integers.
library(GlmSimulatoR)
library(ggplot2)
library(patchwork)
### Create right-skewed dataset that has one continous variable and one integer variable
set.seed(123)
df1 <- data.frame(matrix(ncol = 2, nrow = 1000))
x <- c("int","cont")
colnames(df1) <- x
df1$int <- round(rgamma(1000, shape = 1, scale = 1),0)
df1$cont <- round(rgamma(1000, shape = 1, scale = 1),1)
p1 <- ggplot(data = df1, aes(x = cont, y = int)) +
geom_point(shape = 21, size = 2, color = "black", fill = "black", stroke = 1, alpha = 0.4) +
xlab("Continuous Value") +
ylab("Integer Value") +
theme_bw() +
theme(panel.grid = element_blank(),
text = element_text(size = 16),
axis.text.x = element_text(size = 16, color = "black"),
axis.text.y = element_text(size = 16, color = "black"))
dens1 <- ggplot(df1, aes(x = cont)) +
geom_density(alpha = 0.4) +
theme_void() +
theme(legend.position = "none")
dens2 <- ggplot(df1, aes(x = int)) +
geom_density(alpha = 0.4) +
theme_void() +
theme(legend.position = "none") +
coord_flip()
dens1 + plot_spacer() + p1 + dens2 +
plot_layout(ncol = 2, nrow = 2, widths = c(6,1), heights = c(1,6))
From ?geom_density:
adjust: A multiplicate [sic] bandwidth adjustment. This makes it possible
to adjust the bandwidth while still using the a bandwidth
estimator. For example, ‘adjust = 1/2’ means use half of the
default bandwidth.
So as a start try e.g. geom_density(..., adjust = 2) (bandwidth twice as wide as default) and go from there.
I can successfully create plots of power vs. sample size in R using the pwr package. Example code below.
library(pwr)
library(tidyverse)
plot.out <- pwr.t2n.test(n1=30, n2=30, d=0.5, alternative="two.sided")
#See output in link below
plot(plot.out)
plot() output
I would like to create a similar plot -- a two-sample t-test in which effect size is on the y-axis and power is on the x-axis, with fixed sample sizes.
Is there a way to do this using pwr and/or the plot function? Or would I have to unlist the plot.out object and use it somehow?
I'm still new to power curves in R. Thanks in advance for any advice.
In the code below the power is computed in a loop on effect size d_seq. Then the power d is extracted from the results list, a data.frame is created and plotted.
library(pwr)
library(ggplot2)
d_seq <- seq(0, 2, by = 0.1)
pwr_list <- lapply(d_seq, function(d){
pwr.t2n.test(n1 = 30, n2 = 30,
d = d,
power = NULL,
sig.level = 0.05,
alternative = "two.sided")
})
pwr <- sapply(pwr_list, '[[', 'power')
dfpwr <- data.frame(power = pwr, effect.size = d_seq)
ggplot(dfpwr, aes(effect.size, power)) +
geom_point(size = 2, colour = "black") +
geom_line(size = 0.5, colour = "red") +
scale_y_continuous(labels = scales::percent) +
xlab("effect size") +
ylab(expression("test power =" ~ 1 - beta))
To draw a line where power is 80% and get the effect size, first compute the effect size from the pwr vector by linear interpolation.
pwr80 <- approx(x = pwr, y = d_seq, xout = 0.8)
Now create a label for geom_text and plot it.
lbl80 <- paste("Power = 80%\n")
lbl80 <- paste(lbl80, "Effect size =", round(pwr80$y, 2))
ggplot(dfpwr, aes(effect.size, power)) +
geom_point(size = 2, colour = "black") +
geom_line(size = 0.5, colour = "red") +
geom_hline(yintercept = 0.8, linetype = "dotted") +
geom_text(x = pwr80$y, y = pwr80$x,
label = lbl80,
hjust = 1, vjust = -1) +
scale_y_continuous(labels = scales::percent) +
xlab("effect size") +
ylab(expression("test power =" ~ 1 - beta))
To also draw a vertical line, add
geom_vline(xintercept = pwr80$y, linetype = "dotted")
This code
library(ggplot2)
library(MASS)
# Generate gamma rvs
x <- rgamma(100000, shape = 2, rate = 0.2)
den <- density(x)
dat <- data.frame(x = den$x, y = den$y)
ggplot(data = dat, aes(x = x, y = y)) +
geom_point(size = 3) +
theme_classic()
# Fit parameters (to avoid errors, set lower bounds to zero)
fit.params <- fitdistr(estimate, "gamma", lower = c(0, 0))
# Plot using density points
ggplot(data = dat, aes(x = x,y = y)) +
geom_point(size = 3) +
geom_line(aes(x=dat$x, y=dgamma(dat$x,fit.params$estimate["shape"], fit.params$estimate["rate"])),
color="red", size = 1) +
theme_classic()
fits and plots the distribution of series x. The resulting plot is:
Packages stats and MASS seem not to support the Rayleigh distribution. How can I extend the previous code to the Rayleigh distribution?
In the code below I start by recreating the vector x, this time setting the RNG seed, in order to make the results reproducible. Then a data.frame dat with only that vector is also recreated.
The density functions of the Gamma and Rayleigh distributions are fit to the histogram of x by first estimating their parameters and with stat_function.
library(ggplot2)
library(MASS)
library(extraDistr) # for the Rayleigh distribution functions
# Generate gamma rvs
set.seed(2020)
x <- rgamma(100000, shape = 2, rate = 0.2)
dat <- data.frame(x)
# Fit parameters (to avoid errors, set lower bounds to zero)
fit.params <- fitdistr(dat$x, "gamma", lower = c(0, 0))
ggplot(data = dat, aes(x = x)) +
geom_histogram(aes(y = ..density..), bins = nclass.Sturges(x)) +
stat_function(fun = dgamma,
args = list(shape = fit.params$estimate["shape"],
rate = fit.params$estimate["rate"]),
color = "red", size = 1) +
ggtitle("Gamma density") +
theme_classic()
fit.params.2 <- fitdistrplus::fitdist(dat$x, "rayleigh", start = list(sigma = 1))
fit.params.2$estimate
ggplot(data = dat, aes(x = x)) +
geom_histogram(aes(y = ..density..), bins = nclass.Sturges(x)) +
stat_function(fun = drayleigh,
args = list(sigma = fit.params.2$estimate),
color = "blue", size = 1) +
ggtitle("Rayleigh density") +
theme_classic()
To plot points and lines like in the question, not histograms, use the code below.
den <- density(x)
orig <- data.frame(x = den$x, y = den$y)
ggplot(data = orig, aes(x = x)) +
geom_point(aes(y = y), size = 3) +
geom_line(aes(y = dgamma(x, fit.params$estimate["shape"], fit.params$estimate["rate"])),
color="red", size = 1) +
geom_line(aes(y = drayleigh(x, fit.params.2$estimate)),
color="blue", size = 1) +
theme_classic()
i have the following issue right now;
I want to create plots with ggplot2 where the elements panel.grid.major.x and panel.grid.major.y form squares within the plot.
My solution so far includes defining the amount of major lines from the x- and y-axis of the plot as well as the option aspect.ratio in the theme options. Following code is a MWE, my actual code right now contains more options:
library(ggplot2)
#remotes::install_github("allisonhorst/palmerpenguins")
library(palmerpenguins)
equal_breaks2 <- function(n = 3, s = 0.05, ...){
function(x){
# rescaling
d <- s * diff(range(x)) / (1+2*s)
seq(min(x)+d, max(x)-d, length=n)
}
}
# This functions comes from a great answer here
# https://stackoverflow.com/questions/28436855/change-the-number-of-breaks-using-facet-grid-in-ggplot2
n_x <- 5
n_y <- 3
ggplot(palmerpenguins::penguins, aes(x = bill_depth_mm, y= bill_length_mm)) +
geom_point(aes(colour = species, shape = sex)) +
scale_color_viridis_d() +
scale_x_continuous(breaks = equal_breaks2(n = n_x, s = 0.00), expand = c(0,0)) +
scale_y_continuous(breaks = equal_breaks2(n = n_y, s = 0.00), expand = c(0,0)) +
theme(aspect.ratio = n_y/n_x,
panel.grid.minor = element_blank()) +
coord_fixed()
This plot unfortunately does not produce exact squares from the grid lines. One has to manually adjust the aspect ratio (in this example n_y=3.7 looks pretty good).
Does anyone have an idea how to solve this, without having to adjust values manually?
Edit: I forgot to mention this in my initial request; Ideally my plot limits are the min and max value of my breaks, so i also have squares at the borders of the plot.
To get a nice scale, I used scales::pretty_breaks.
Let d_x and d_y be the step size between breaks calculated by the scale function.
Let range_x and range_y be the x and y range of the data to plot.
To get squares, aspect.ratio should be :
d_x * range_y / ( d_y * range_x)
Try :
library(ggplot2)
library(scales)
data <- palmerpenguins::penguins
scale_x <- scales::pretty_breaks(n = 5)(data$bill_depth_mm)
scale_y <- scales::pretty_breaks(n = 3)(data$bill_length_mm)
d_x <- diff(scale_x)[1]
d_y <- diff(scale_y)[1]
range_x <- diff(range(scale_x))
range_y <- diff(range(scale_y))
ggplot(data, aes(x = bill_depth_mm, y= bill_length_mm)) +
geom_point(aes(colour = species, shape = sex)) +
scale_color_viridis_d() +
scale_x_continuous(breaks = scale_x, expand = c(0,0)) +
scale_y_continuous(breaks = scale_y, expand = c(0,0)) +
theme(aspect.ratio = d_x * range_y / ( d_y * range_x),
panel.grid.minor = element_blank()) +
coord_fixed(xlim=range(scale_x),ylim=range(scale_y))
So, with the great help of #Waldi, i came up with an automatic solution. Its totally viable to do all the calculation beforehand, but i wanted an automatic solution, within the ggplot-chain.
I created my own coord-ggproto object, which can calculate the aspect ratio from the internals in ggplot (According to the Formula of #Waldi).
CoordOwn <- ggproto("CoordOwn", CoordCartesian,
is_free = function() FALSE,
aspect = function(self, ranges) {
d_x = diff(ranges$x.major_source)[1]
d_y = diff(ranges$y.major_source)[1]
(d_x * diff(ranges$y.range)) / (d_y * diff(ranges$x.range))
}
)
coord_own <- function(ratio = 1, xlim = NULL, ylim = NULL, expand = TRUE, clip = "on") {
ggproto(NULL, CoordOwn,
limits = list(x = xlim, y = ylim),
ratio = ratio,
expand = expand,
clip = clip
)
}
Now i can change n_x and n_y however i want them to, and coord_own fixes the aspect ratio accordingly:
n_x <- 5
n_y <- 5
ggplot(palmerpenguins::penguins, aes(x = bill_depth_mm, y= bill_length_mm)) +
geom_point(aes(colour = species, shape = sex)) +
scale_color_viridis_d() +
scale_x_continuous(breaks = equal_breaks2(n = n_x, s = 0.00), expand = c(0,0)) +
scale_y_continuous(breaks = equal_breaks2(n = n_y, s = 0.00), expand = c(0,0)) +
theme(panel.grid.minor = element_blank()) +
coord_own()
I've plotted a confusion matrix (predicting 5 outcomes) in R using ggplot and scales for geom_text labeling.
The way geom_text(aes(label = percent(Freq/sum(Freq))) is written in code, it's showing Frequency of each box divided by sum of all observations, but what I want to do is get Frequency of each box divided by sum Frequency for each Reference.
In other words, instead of A,A = 15.8%,
it should be A,A = 15.8%/(0.0%+0.0%+0.0%+0.0%+15.8%%) = 100.0%
library(ggplot2)
library(scales)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
p <-
ggplot(data = as.data.frame(m$table) ,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percent(Freq/sum(Freq)))) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
The problem is that, as far as I know, ggplot is not able to do group calculation. See this recent post for similar question.
To solve your problem you should take advantage of the dplyrpackage.
This should work
library(ggplot2)
library(scales)
library(caret)
library(dplyr)
valid_actual <- as.factor(c("A","B","B","C","C","C","E","E","D","D","A","A","A","E","E","D","D","C","B"))
valid_pred <- as.factor(c("A","B","C","C","E","C","E","E","D","B","A","B","A","E","D","E","D","C","B"))
cfm <- confusionMatrix(valid_actual, valid_pred)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
data_c <- mutate(group_by(as.data.frame(m$table), Reference ), percentage =
percent(Freq/sum(Freq)))
p <-
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = log(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "green") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
theme(legend.position = "none") +
ggtitle(mytitle)
return(p)
}
ggplotConfusionMatrix(cfm)
And the result: