Creating 95% confidence band with ggplot - r

I'm having trouble plotting a 95% confidence band with ggplot.
Here is my code:
> my.data
scar response.rate
1 HTS 0.88
2 HTS 0.56
3 HTS 0.56
4 HTS 0.82
5 HTS 0.10
6 HTS 0.47
7 HTS 0.83
8 HTS 0.60
9 Linear 0.83
10 Linear 0.56
11 Linear 0.79
12 Linear 0.55
13 Linear 0.70
14 Linear 0.50
15 Keloid 1.00
16 Keloid 0.83
17 Keloid 1.00
18 Striae Distensae 0.33
19 Striae Distensae 0.33
ggplot(my.data, aes(scar, response.rate))+geom_point()+geom_smooth()
The output it produces:
When I use numbers for the scars, I am able to produce the following:
Is there any way I can produce the same graph with the scar type instead of the numbers?

With this method you create the CIs manually, then plot them using geom_errorbar():
library(ggplot2)
# Creating some data:
my.data = data.frame(c(rep("H",5),rep("L",5),rep("K",5),rep("S",5)), rnorm(20,1,.5) )
names(my.data) = c("scar", "response.rate")
# Standard error function
foo = function(x){sd(x)/sqrt(length(x))}
# Creating CI's manually
my.aggs = cbind(aggregate(response.rate ~ scar, data = my.data, FUN = foo),
aggregate(response.rate ~ scar, data = my.data, FUN = mean))
names(my.aggs) = c("scar","se","","means")
# Plotting
ggplot()+
geom_point(data = my.data, aes(as.factor(scar), response.rate)) +
geom_errorbar(data = my.aggs, aes(scar, ymin=means-1.96*se, ymax=means+1.96*se), width=.1)
# Alternative method that doesn't include points
library(gplots)
plotmeans(response.rate ~ scar, data = my.data)
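If you want ggplot to compute the intervals for you, stat_summary() can do the same job in a single layer. This is a minimal sketch using the original my.data from the question; note that mean_cl_normal gives a t-based 95% interval rather than the 1.96*se normal approximation above, and it requires the Hmisc package to be installed.
library(ggplot2)
# Sketch: points plus a 95% CI per scar type, computed by stat_summary()
# (mean_cl_normal wraps Hmisc::smean.cl.normal, so Hmisc must be installed)
ggplot(my.data, aes(scar, response.rate)) +
  geom_point() +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar",
               width = 0.1, colour = "blue")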

Related

Single heatmap on two symmetric matrices with different colours and scales R

I want to achieve the same end goal as this question: Create a single heatmap based on two symmetric matrices in R, but to take it further than the answer currently provided.
The answer given does not explain how one would go about having different colours and different scales for the upper and lower sections of the matrix.
Here is the example dataset:
library(Matrix)
set.seed(123)
s1<-forceSymmetric(matrix(round(rnorm(25),2),5))
colnames(s1)<-LETTERS[1:5]
rownames(s1)<-LETTERS[6:10]
diag(s1)<-1
s2<-forceSymmetric(matrix(round(rbinom(25,25,0.3),2),5))
colnames(s2)<-LETTERS[1:5]
rownames(s2)<-LETTERS[6:10]
diag(s2)<-1
s1
# 5 x 5 Matrix of class "dsyMatrix"
# A B C D E
# F 1.00 1.72 1.22 1.79 -1.07
# G 1.72 1.00 0.36 0.50 -0.22
# H 1.22 0.36 1.00 -1.97 -1.03
# I 1.79 0.50 -1.97 1.00 -0.73
# J -1.07 -0.22 -1.03 -0.73 1.00
s2
# 5 x 5 Matrix of class "dsyMatrix"
# A B C D E
# F 1 6 8 7 9
# G 6 1 5 9 8
# H 8 5 1 10 9
# I 7 9 10 1 1
# J 9 8 9 1 1
The linked answer suggests adding the two matrices together like so:
#Get upper diagonal
reverse = s1[,ncol(s1):1]
diag(reverse) = 0
reverse[lower.tri(reverse, diag = FALSE)] <- 0
upper = reverse[,ncol(reverse):1]
# Get lower diagonal
reverse1 = s2[,ncol(s2):1]
diag(reverse1) = 0
reverse1[upper.tri(reverse1, diag = FALSE)] <- 0
upper1 = reverse1[,ncol(reverse1):1]
# Add them together
merged = as.matrix(upper+upper1)
merged
A B C D E
F 1.00 1.72 1.22 1.79 0
G 1.72 1.00 0.36 0.00 8
H 1.22 0.36 0.00 10.00 9
I 1.79 0.00 10.00 1.00 1
J 0.00 8.00 9.00 1.00 1
It then suggests using heatmap(merged); however, how would you go about having different colours and scales for the upper and lower parts of the matrix?
I am happy with using any package including ggplot2 to make this work.
Thanks in advance!
You can subset the relevant parts of the matrix in the data argument of a layer, and use {ggnewscale} to assign different fill scales to different layers. The trick is to declare a fill scale before adding new_scale_fill(), otherwise the order of operations goes wrong (which usually doesn't matter much, but here it does).
You can then tweak every individual scale. In the example below I just tweaked the palettes, but you can also adjust limits, breaks, labels etc.
# Assuming code from question has been executed and we have a 'merged' in memory
library(ggplot2)
library(ggnewscale)
# Wide matrix to long dataframe
# Later, we'll be relying on the notion that the dimnames have been
# converted to factor variables to separate out the upper from the lower
# matrix.
df <- reshape2::melt(merged)
ggplot(df, aes(Var1, Var2)) +
# The first layer, with its own fill scale
geom_raster(
data = ~ subset(.x, as.numeric(Var1) > as.numeric(Var2)),
aes(fill = value)
) +
scale_fill_distiller(palette = "Blues") +
# Declare new fill scale for the second layer
new_scale_fill() +
geom_raster(
data = ~ subset(.x, as.numeric(Var1) < as.numeric(Var2)),
aes(fill = value)
) +
scale_fill_distiller(palette = "Reds") +
# I'm not sure what to do with the diagonal. Make it grey?
new_scale_fill() +
geom_raster(
data = ~ subset(.x, as.numeric(Var1) == as.numeric(Var2)),
aes(fill = value)
) +
scale_fill_distiller(palette = "Greys", guide = "none")
In my opinion #teunbrand's answer is what you're looking for, but another potential option is to use the ComplexHeatmap package, e.g. based on one of the examples in the docs:
library(Matrix)
set.seed(123)
s1<-forceSymmetric(matrix(round(rnorm(25),2),5))
colnames(s1)<-LETTERS[1:5]
rownames(s1)<-LETTERS[6:10]
diag(s1)<-1
s2<-forceSymmetric(matrix(round(rbinom(25,25,0.3),2),5))
colnames(s2)<-LETTERS[1:5]
rownames(s2)<-LETTERS[6:10]
diag(s2)<-1
#Get upper diagonal
reverse = s1[,ncol(s1):1]
diag(reverse) = 0
reverse[lower.tri(reverse, diag = FALSE)] <- 0
upper = reverse[,ncol(reverse):1]
# Get lower diagonal
reverse1 = s2[,ncol(s2):1]
diag(reverse1) = 0
reverse1[upper.tri(reverse1, diag = FALSE)] <- 0
upper1 = reverse1[,ncol(reverse1):1]
# Add them together
m = as.matrix(upper+upper1)
m
#> A B C D E
#> F 1.00 1.72 1.22 1.79 0
#> G 1.72 1.00 0.36 0.00 8
#> H 1.22 0.36 0.00 10.00 9
#> I 1.79 0.00 10.00 1.00 1
#> J 0.00 8.00 9.00 1.00 1
library(ComplexHeatmap)
library(circlize)
col1 = colorRamp2(c(-1, 10), c("white", "red"))
col2 = colorRamp2(c(-1, 10), c("white", "blue3"))
# here reordering the symmetric matrix is necessary
od = hclust(dist(m))$order
m = m[od, od]
ht = Heatmap(m, rect_gp = gpar(type = "none"), show_heatmap_legend = FALSE,
cluster_rows = FALSE, cluster_columns = FALSE,
layer_fun = function(j, i, x, y, w, h, fill) {
l = i > j
grid.rect(x[l], y[l], w[l], h[l],
gp = gpar(fill = col1(pindex(m, i[l], j[l])), col = NA))
l = i < j
grid.rect(x[l], y[l], w[l], h[l],
gp = gpar(fill = col2(pindex(m, i[l], j[l])), col = NA))
})
draw(ht, heatmap_legend_list = list(
Legend(title = "Group_A", col_fun = col1),
Legend(title = "Group_B", col_fun = col2)
))
Created on 2022-03-07 by the reprex package (v2.0.1)

compare qqplot of a sample with a reference probability distribution in R

I have a weather dataset. I fitted a simple linear model for two of its columns, Temperature and Humidity, plotted a histogram of the residuals, and calculated their mean and standard deviation.
model <- lm(Temperature..C. ~ Humidity, data = inputData)
model.res = resid(model)
hist(model.res)
mean(model.res)
sd(model.res)
I need to plot a QQ-plot of the residuals against a zero-mean normal distribution with the estimated standard deviation. I used a Kolmogorov-Smirnov test to compare the sample with a reference probability distribution, but I don't know how to plot the two together:
ks<-ks.test(model.res, "pnorm", mean=0, sd=sd(model.res))
qqnorm(model.res, main="qqnorm")
qqline(model.res)
Data example:
Temperature..C. Humidity
1 9.472222 0.89
2 9.355556 0.86
3 9.377778 0.89
4 8.288889 0.83
5 8.755556 0.83
6 9.222222 0.85
7 7.733333 0.95
8 8.772222 0.89
9 10.822222 0.82
10 13.772222 0.72
11 16.016667 0.67
12 17.144444 0.54
13 17.800000 0.55
14 17.333333 0.51
15 18.877778 0.47
16 18.911111 0.46
17 15.388889 0.60
18 15.550000 0.63
19 14.255556 0.69
20 13.144444 0.70
Here is a solution using ggplot2:
ggplot(model, aes(sample = rstandard(model))) +
geom_qq() +
stat_qq_line(dparams=list(sd=sd(model.res)), color="red") +
stat_qq_line()
The red line represents the qqline with your sample sd, the black line one with an sd of 1.
You did not ask for that, but you could also add a smoothed qqplot:
data_model <- model
data_model$theo <- unlist(qqnorm(data_model$residuals)[1])
ggplot(data_model, aes(sample = rstandard(data_model))) +
geom_qq() +
stat_qq_line(dparams=list(sd=sd(model.res)), color="red") +
geom_smooth(aes(x=data_model$theo, y=data_model$residuals), method = "loess")
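If you also want a base-R version that plots the residuals against an explicit N(0, sd) reference, the following is a minimal sketch (assuming model.res from the question is still in memory); qqplot() accepts theoretical quantiles directly:
# Sketch: residuals vs. quantiles of N(0, sd(model.res))
qqplot(qnorm(ppoints(length(model.res)), mean = 0, sd = sd(model.res)),
       model.res,
       xlab = "Theoretical quantiles of N(0, sd)", ylab = "Sample residuals")
abline(0, 1, col = "red")  # y = x: perfect agreement with the reference distribution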

Why can't I use cv.glm on the output of bestglm?

I am trying to do best subset selection on the wine dataset, and then I want to get the test error rate using 10-fold CV. The code I used is:
cost1 <- function(good, pi=0) mean(abs(good-pi) > 0.5)
res.best.logistic <-
bestglm(Xy = winedata,
family = binomial, # binomial family for logistic
IC = "AIC", # Information criteria
method = "exhaustive")
res.best.logistic$BestModels
best.cv.err<- cv.glm(winedata,res.best.logistic$BestModel,cost1, K=10)
However, this gives the error:
Error in UseMethod("family") : no applicable method for 'family' applied to an object of class "NULL"
I thought that $BestModel is the lm object that represents the best fit, and that's what the manual also says. If that's the case, then why can't I find the test error on it using 10-fold CV with the help of cv.glm?
The dataset used is the white wine dataset from https://archive.ics.uci.edu/ml/datasets/Wine+Quality; the packages used are boot (for cv.glm) and bestglm.
The data was processed as follows:
winedata <- read.delim("winequality-white.csv", sep = ';')
winedata$quality[winedata$quality< 7] <- "0" #recode
winedata$quality[winedata$quality>=7] <- "1" #recode
winedata$quality <- factor(winedata$quality)# Convert the column to a factor
names(winedata)[names(winedata) == "quality"] <- "good" #rename 'quality' to 'good'
The bestglm fit rearranges your data and renames your response variable to y, so if you pass it back into cv.glm, winedata does not have a column y and everything crashes after that.
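You can confirm the mismatch directly (a quick check, not part of the original answer):
# winedata has no column called y, so cv.glm cannot evaluate BestModel's call on it
"y" %in% names(winedata)   # expected: FALSE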
It's always good to check what the class is:
class(res.best.logistic$BestModel)
[1] "glm" "lm"
But if you look at the call of res.best.logistic$BestModel:
res.best.logistic$BestModel$call
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
head(res.best.logistic$BestModel$model)
y fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
1 0 7.0 0.27 0.36 20.7 0.045
2 0 6.3 0.30 0.34 1.6 0.049
3 0 8.1 0.28 0.40 6.9 0.050
4 0 7.2 0.23 0.32 8.5 0.058
5 0 7.2 0.23 0.32 8.5 0.058
6 0 8.1 0.28 0.40 6.9 0.050
free.sulfur.dioxide density pH sulphates
1 45 1.0010 3.00 0.45
2 14 0.9940 3.30 0.49
3 30 0.9951 3.26 0.44
4 47 0.9956 3.19 0.40
5 47 0.9956 3.19 0.40
6 30 0.9951 3.26 0.44
You can substitute things in the call etc., but it's too much of a mess. Fitting is not costly, so fit the model on winedata yourself and pass that to cv.glm:
best_var = apply(res.best.logistic$BestModels[,-ncol(winedata)],1,which)
# take the variable names for best model
best_var = names(best_var[[1]])
new_form = as.formula(paste("good ~", paste(best_var,collapse="+")))
fit = glm(new_form,winedata,family="binomial")
best.cv.err<- cv.glm(winedata,fit,cost1, K=10)
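As a hedged alternative (not from the original answer), you could also read the selected variables straight out of BestModel's model frame instead of indexing BestModels by hand; every column other than y is a kept predictor:
# Sketch: rebuild the formula from BestModel's model frame and refit on winedata
kept <- setdiff(colnames(res.best.logistic$BestModel$model), "y")
fit_alt <- glm(reformulate(kept, response = "good"), data = winedata, family = "binomial")
best.cv.err.alt <- cv.glm(winedata, fit_alt, cost1, K = 10)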

Model multiple imputation with interaction terms

According to the documentation of the mice package, if we want to impute data when we're interested in interaction terms, we need to use passive imputation, which is done the following way:
library(mice)
nhanes2.ext <- cbind(nhanes2, bmi.chl = NA)
ini <- mice(nhanes2.ext, max = 0, print = FALSE)
meth <- ini$meth
meth["bmi.chl"] <- "~I((bmi-25)*(chl-200))"
pred <- ini$pred
pred[c("bmi", "chl"), "bmi.chl"] <- 0
imp <- mice(nhanes2.ext, meth = meth, pred = pred, seed = 51600, print = FALSE)
It is said that
Imputations created in this way preserve the interaction of bmi with chl
Here, a new variable called bmi.chl is created in the original dataset. The meth step tells how this variable needs to be imputed from the existing ones. The pred step says we don't want to predict bmi and chl from bmi.chl. But now, if we want to apply a model, how do we proceed? Is the product defined by "~I((bmi-25)*(chl-200))" just a way to control for the imputed values of the main effects, i.e. bmi and chl?
If the model we want to fit is glm(hyp~chl*bmi, family="binomial"), what is the correct way to specify this model from the imputed data? fit1 or fit2?
fit1 <- with(data=imp, glm(hyp~chl*bmi, family="binomial"))
summary(pool(fit1))
Or do we have to use somehow the imputed values of the new variable created, i.e. bmi.chl?
fit2 <- with(data=imp, glm(hyp~chl+bmi+bmi.chl, family="binomial"))
summary(pool(fit2))
With passive imputation, it does not matter if you use the passively imputed variable, or if you re-calculate the product term in your call to glm.
The reason that fit1 and fit2 yield different results in your example is that you are not just doing passive imputation for the product term.
Instead, you are transforming the two variables before multiplying (i.e., you calculate bmi-25 and chl-200). As a result, the passively imputed variable bmi.chl does not represent the product term bmi*chl but rather (bmi-25)*(chl-200).
If you just calculate the product term, then fit1 and fit2 yield the same results, as they should:
library(mice)
nhanes2.ext <- cbind(nhanes2, bmi.chl = NA)
ini <- mice(nhanes2.ext, max = 0, print = FALSE)
meth <- ini$meth
meth["bmi.chl"] <- "~I(bmi*chl)"
pred <- ini$pred
pred[c("bmi", "chl"), "bmi.chl"] <- 0
pred[c("hyp"), "bmi.chl"] <- 1
imp <- mice(nhanes2.ext, meth = meth, pred = pred, seed = 51600, print = FALSE)
fit1 <- with(data=imp, glm(hyp~chl*bmi, family="binomial"))
summary(pool(fit1))
# > round(summary(pool(fit1)),2)
# est se t df Pr(>|t|) lo 95 hi 95 nmis fmi lambda
# (Intercept) -23.94 38.03 -0.63 10.23 0.54 -108.43 60.54 NA 0.41 0.30
# chl 0.10 0.18 0.58 9.71 0.58 -0.30 0.51 10 0.43 0.32
# bmi 0.70 1.41 0.49 10.25 0.63 -2.44 3.83 9 0.41 0.30
# chl:bmi 0.00 0.01 -0.47 9.67 0.65 -0.02 0.01 NA 0.43 0.33
fit2 <- with(data=imp, glm(hyp~chl+bmi+bmi.chl, family="binomial"))
summary(pool(fit2))
# > round(summary(pool(fit2)),2)
# est se t df Pr(>|t|) lo 95 hi 95 nmis fmi lambda
# (Intercept) -23.94 38.03 -0.63 10.23 0.54 -108.43 60.54 NA 0.41 0.30
# chl 0.10 0.18 0.58 9.71 0.58 -0.30 0.51 10 0.43 0.32
# bmi 0.70 1.41 0.49 10.25 0.63 -2.44 3.83 9 0.41 0.30
# bmi.chl 0.00 0.01 -0.47 9.67 0.65 -0.02 0.01 25 0.43 0.33
This is not surprising because the ~I(bmi*chl) in mice and the bmi*chl in glm do the exact same thing. They merely calculate the product of the two variables.
Remark:
Note that I added a line saying that bmi.chl should be used as a predictor when imputing hyp. Without this step, passive imputation has no purpose because the imputation model would neglect the product term, thus being incongruent with the analysis model.
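If you want to convince yourself that the passive step keeps the product consistent with the imputed main effects, a quick check on a completed dataset is (a sketch, not part of the original answer):
# Sketch: in every completed dataset, bmi.chl should equal bmi * chl exactly
comp <- complete(imp, 1)
all.equal(comp$bmi.chl, comp$bmi * comp$chl)   # expected: TRUE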

Polychoric regression significance test

I have Likert-type data, some on a 4-point scale and others on a 3-point scale. For example:
variable.x <- c(5,2,4,5,3,1)
variable.y <- c(0,1,1,0,2,1)
my.data <- cbind(variable.x, variable.y)
library(psych)
polychoric(my.data)
#Call: polychoric(x = my.data)
#Polychoric correlations
# vrbl.x vrbl.y
#variable.x 1.00
#variable.y -0.25 1.00
# with tau of
# 0 1 2 3 4
#variable.x -0.97 -0.43 0 0.43 Inf
#variable.y -0.43 0.97 Inf Inf Inf
How can I obtain a significance value for the correlation of -0.25?
Strangely, the polycor package gives totally different results:
library(polycor)
polychor(variable.x, variable.y, ML=TRUE, std.err=TRUE)
#Polychoric Correlation, ML est. = -0.7599 (0.2588)
#Test of bivariate normality: Chisquare = 9.088, df = 7, p = 0.2464
# Row Thresholds
# Threshold Std.Err.
#1 -0.9388 0.5519
#2 -0.5774 0.5040
#3 -0.1906 0.5267
#4 0.3979 0.5690
# Column Thresholds
# Threshold Std.Err.
#1 -0.3891 0.5683
#2 0.9692 0.5568
Now the correlation coefficient is -0.76, not -0.25. Again, how would I find its significance?
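One common way to get a p-value here is a Wald test: divide the ML estimate by its standard error, both of which polychor() reports when std.err = TRUE, and compare the ratio to a standard normal. The sketch below assumes (as in polycor's printed output) that the correlation is the first parameter in the covariance matrix stored in the var component; treat it as an approximation, not an authoritative answer.
library(polycor)
pc <- polychor(variable.x, variable.y, ML = TRUE, std.err = TRUE)
# Wald z-test for rho (assumes pc$var[1, 1] is the variance of the rho estimate)
z <- pc$rho / sqrt(pc$var[1, 1])
p.value <- 2 * pnorm(-abs(z))
z; p.value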
