R: Customizing Scatterplots

R: Customizing Scatterplots - r

I am using the R programming language. I am trying to follow the answer posted in this previous stackoverflow post (scatterplot3d: regression plane with residuals) and add a "plane" to a scatterplot.
Suppose I have the following data:
my_data <- data.frame(read.table(header=TRUE,
row.names = 1,
text="
weight height age
1 2998.958 15.26611 53
2 3002.208 18.08711 52
3 3008.171 16.70896 49
4 3002.374 17.37032 55
5 3000.658 18.04860 50
6 3002.688 17.24797 45
7 3004.923 16.45360 47
8 2987.264 16.71712 47
9 3011.332 17.76626 50
10 2983.783 18.10337 42
11 3007.167 18.18355 50
12 3007.049 18.11375 53
13 3002.656 15.49990 42
14 2986.710 16.73089 47
15 2998.286 17.12075 52
"))
I adapted the code to fit my example:
library(scatterplot3d)
model_1 <- lm(age ~ weight + height, data = my_data)
# scatterplot
s3d <- scatterplot3d(my_data$height, my_data$weight, my_data$age, pch = 19, type = "p", color = "darkgrey",
main = "Regression Plane", grid = TRUE, box = FALSE,
mar = c(2.5, 2.5, 2, 1.5), angle = 55)
# regression plane
s3d$plane3d(model_1, draw_polygon = TRUE, draw_lines = TRUE,
polygon_args = list(col = rgb(.1, .2, .7, .5)))
# overlay positive residuals
wh <- resid(model_1) > 0
s3d$points3d(my_data$height, my_data$weight, my_data$age, pch = 19)
Problem: However, the "plane" appears to be absent :
Desired Result:
Can someone please show me what I am doing wrong?
Thanks

The order of height and weight caused the problem.
s3d <- scatterplot3d(my_data$weight, my_data$height,my_data$age, pch = 19, type = c("p"), color = "darkgrey",
main = "Regression Plane", grid = TRUE, box = FALSE,
mar = c(2.5, 2.5, 2, 1.5), angle = 55)
# regression plane
s3d$plane3d(model_1, draw_polygon = TRUE, draw_lines = TRUE,
polygon_args = list(col = rgb(.1, .2, .7, .5)))
# overlay positive residuals
wh <- resid(model_1) > 0
s3d$points3d(my_data$height, my_data$weight, my_data$age, pch = 19)

Related

Customize DCA plot labeling

I have a DCA object
summary(dca)
Site scores:
DCA1 DCA2 DCA3 DCA4 Totals
AH_T1 -0.083971 -0.600384 0.513428 0.794499 18
AH_T2 -0.017126 -0.410023 -0.194760 0.282255 14
AH_T3 -0.079178 -0.244031 0.274927 0.570446 16
SB_T1 -0.413546 0.033417 0.762520 0.454288 17
HZ_T1 -0.205265 -0.401048 -0.329853 0.098905 20
HZ_T2 -0.051373 -0.707477 -0.626761 -0.090690 11
HZ_T3 -0.209482 -0.284500 -0.456123 0.287953 17
IH_T1 -0.017786 0.212286 0.402395 -0.408806 18
IH_T2 -0.394654 -0.129818 0.358900 -0.429682 15
IH_T3 0.059865 -0.261604 0.568370 -0.300224 16
IH_T4 -0.159250 -0.144365 0.498412 -0.599404 12
IH_T5 -0.547112 -0.189500 0.758668 -0.303647 20
IH_T6 -0.673832 -0.467925 0.924921 -0.210702 17
OST_T1 -0.080579 -0.168180 -0.074228 -0.395828 14
RW_T1 -0.347305 -0.005233 -0.492337 -0.074018 18
RW_T2 -0.430563 -0.081961 -0.039556 -0.289489 11
RW_T3 -0.427977 -0.401413 -0.733190 0.092576 20
RW_T4 -0.437579 0.115553 -0.466525 -0.155855 13
RW_T5 -0.497717 -0.057785 -0.089060 -0.257333 13
RWB_T1 0.992864 -0.084741 -0.172626 0.254606 22
RWB_T2 0.492169 -0.203401 -0.506953 0.511755 19
RWB_T3 1.372265 0.336062 0.070865 0.019864 16
RWB_T4 0.789543 0.168187 0.703618 -0.672646 17
WM_T1 -0.112521 -0.797035 -0.372285 0.077007 14
WM_T2 0.008648 -0.546527 -0.272787 -0.099172 15
WM_T3 -0.161854 -0.781419 -0.499302 -0.475927 18
WM_T4 -0.247226 -0.792233 -0.167708 -0.112969 12
WM_T5 -0.474015 -0.822478 -0.194942 -0.321107 11
WR_T3 -0.186818 0.314175 -0.157542 -0.245089 4
WR_T4 -0.421249 0.283086 0.021062 0.081024 9
FS_T1 -0.303028 2.147182 -0.215759 0.372133 13
FS_T2 -0.450260 1.934448 -0.277489 -0.023677 7
FS_T3 -0.342402 1.536942 -0.385465 0.105576 9
FS_T4 -0.770140 1.360177 -0.163095 0.172052 11
WR_T1 -1.268393 1.374820 -0.000533 0.180091 12
WR_T2 -1.099601 0.896554 0.059584 0.372984 16
RWB_Si 4.133449 0.575623 0.163993 0.115948 20
When I try to plot the object with the base plot function, there is a lack of customization.
plot (dca, display = 'sites', type = 'p',
main = "DCA",
cols = c("black"), pch = 3, cex = 0.7)
text(dca, display = 'sites', cex=0.7, pos=2)
The plot labels overlap, is there a way to sort out this issue?
I have tried to extract the data with a function I found to a similiar issue:
ggvegan_data <- function(object, axes = c(1, 2), layers = c("species", "sites"), ...){
obj <- fortify(object, axes = axes, ...)
obj <- obj[obj$Score %in% layers, , drop = FALSE]
want <- obj$Score %in% c("species", "sites")
obj[want, , drop = FALSE]
}
and then try to use ggplot
lichen.plot.data = ggvegan_data(dca)
p <- ggplot(data = plot.data, aes(x = DCA1, DCA2, colour = Score)) +
geom_point() +
geom_text(aes(label = Label), nudge_y = 0.3)
p
But there is following error message:
Error in `fortify()`:
! `data` must be a <data.frame>, or an object coercible by `fortify()`, not an S3 object with class <decorana>.
Run `rlang::last_error()` to see where the error occurred.

I have found kind of a work around.
t2<-scores(dca)
#transorm it into a dataframe
t2<-as.data.frame(t2)
class(t2)
site.label <- c("AH_T1","AH_T2","AH_T3","SB_T1", "HZ_T1", "HZ_T2", "HZ_T3", "IH_T1","IH_T2","IH_T3","IH_T4","IH_T5","IH_T6","OST_T1","RW_T1 ",
"RW_T2","RW_T3","RW_T4","RW_T5","RWB_T1","RWB_T2","RWB_T3","RWB_T4","WM_T1 ","WM_T2","WM_T3","WM_T4","WM_T5","WR_T3","WR_T4",
"FS_T1","FS_T2","FS_T3","FS_T4","WR_T1","WR_T2","RWB_Si")
t2$label <- site.label
#plot the data with ggplot
ggplot(data = t2, aes(x = DCA1, y = DCA2)) + theme_bw() +
geom_text_repel(aes(label = label),
box.padding = unit(0.45, "lines")) +
geom_point(colour = "green", size = 3)
This gives me the ability to use data with ggplot and do different stuff with the label problem.

Seeking help to generate a similar heatmap as attached

I am trying to generate a heatmap as the following figure. I have already tried pheatmap and the code is as follows:
breaks_2 <- seq(min(0), max(2), by = 0.1)
pheatmap::pheatmap(
mat = data,
cluster_cols = F,
cluster_rows = F,
scale = "column",
border_color = "white",
color = inferno(20),
show_colnames = TRUE,
show_rownames = FALSE,
breaks = breaks_2
)
But this does not seem to work. So far I am understanding I am mistaking with defining break or have to use another package than pheatmap. Any suggestion will be really helpful.

The color scale in pheatmap adjusts to the range of the input data. If you want anything above a certain value to be coloured daffodil, then simply send pheatmap a copy of your data with the highest values rounded to 2.
Suppose you have a data frame like this, with values anywhere between 0 and 3:
set.seed(1)
data <- as.data.frame(matrix(runif(64, 0, 3), nrow = 8))
names(data) <- LETTERS[1:8]
data
#> A B C D E F G H
#> 1 0.7965260 1.8873421 2.1528555 0.801662 1.4806239 2.46283888 2.1969412 0.9488151
#> 2 1.1163717 0.1853588 2.9757183 1.158342 0.5586528 1.94118058 2.0781947 1.5559028
#> 3 1.7185601 0.6179237 1.1401055 0.040171 2.4821200 2.34879829 1.4328589 1.9860152
#> 4 2.7246234 0.5296703 2.3323357 1.147164 2.0054002 1.65910893 2.5836284 1.2204906
#> 5 0.6050458 2.0610685 2.8041157 2.609073 2.3827196 1.58915874 1.3142913 2.7386278
#> 6 2.6951691 1.1523112 0.6364276 1.021047 0.3238309 2.36806870 0.7343918 0.8808101
#> 7 2.8340258 2.3095243 1.9550213 1.446240 2.1711328 0.06999361 0.2120371 1.3771972
#> 8 1.9823934 1.4930977 0.3766653 1.798697 1.2338233 1.43169020 0.2983985 0.9971840
Some of the values are greater than two. We want all of these to appear the same colour on our heatmap, so we create a copy of our data for plotting, and round down all of the values that were greater than 2 to be exactly 2:
data_2 <- data
data_2[] <- lapply(data_2, function(x) { x[x > 2] <- 2; x })
So now if we run pheatmap on data_2, we see that all the values that were greater than 2 in our original data frame are coloured daffodil.
library(viridis)
library(pheatmap)
breaks_2 <- seq(0, 2, by = 0.1)
pheatmap(
mat = data_2,
cluster_cols = F,
cluster_rows = F,
border_color = "white",
scale = 'none',
color = inferno(22),
show_colnames = TRUE,
show_rownames = FALSE,
legend_breaks = breaks_2
)

What is wrong with my custom colour palette in this plot?

Using ggsurvplot to draw some Kaplan-Meier curves.
5 curves should be plotted and I want control over their colours.
Here is the output of the survfit being plotted:
> elective_30Decadesurv
Call: survfit(formula = elective30Surv ~ electives$Decade)
n events median 0.95LCL 0.95UCL
electives$Decade=50 14 0 NA NA NA
electives$Decade=60 173 2 NA NA NA
electives$Decade=70 442 5 NA NA NA
electives$Decade=80 168 4 NA NA NA
electives$Decade=90 2 0 NA NA NA
Here is a working plot using the default colour palette, "hue":
> ggsurvplot(elective_30Decadesurv,
data = electives,
palette = "hue",
title = "30 day survival after elective EVAR",
legend = "none",
legend.title = "Decade",
legend.labs = c("5th",
"6th",
"7th",
"8th",
"9th"
),
censor.shape = 124,
ggtheme = survPlotTheme,
risk.table = "nrisk_cumevents",
risk.table.y.text.col = TRUE,
risk.table.fontsize = 3,
risk.table.height = 0.3,
break.time.by = 5,
ylim = c(0.95,
1
),
pval = TRUE,
pval.size = 3,
pval.coord = c(1,
0.96
)
)
See plot in section 3.1.4 of this webpage for the output of the above
The Decade group has 5 entries, so I'm trying to provide five colours to palette.
However, both:
> ggsurvplot(elective_30Decadesurv,
data = electives,
palette = c("#440154",
"#3B528B",
"#21908C",
"#5DC863",
"#5DC863"
),
title = "30 day survival after elective EVAR",
legend = "none",
legend.title = "Decade",
legend.labs = c("5th",
"6th",
"7th",
"8th",
"9th"
),
censor.shape = 124,
ggtheme = survPlotTheme,
risk.table = "nrisk_cumevents",
risk.table.y.text.col = TRUE,
risk.table.fontsize = 3,
risk.table.height = 0.3,
break.time.by = 5,
ylim = c(0.95,
1
),
pval = TRUE,
pval.size = 3,
pval.coord = c(1,
0.96
)
)
And:
> fiveColours <- c("#440154",
"#3B528B",
"#21908C",
"#5DC863",
"#5DC863"
)
> ggsurvplot(elective_30Decadesurv,
data = electives,
palette = fiveColours,
title = "30 day survival after elective EVAR",
legend = "none",
legend.title = "Decade",
legend.labs = c("5th",
"6th",
"7th",
"8th",
"9th"
),
censor.shape = 124,
ggtheme = survPlotTheme,
risk.table = "nrisk_cumevents",
risk.table.y.text.col = TRUE,
risk.table.fontsize = 3,
risk.table.height = 0.3,
break.time.by = 5,
ylim = c(0.95,
1
),
pval = TRUE,
pval.size = 3,
pval.coord = c(1,
0.96
)
)
Give the same error:
Error in names(.cols) <- grp.levels :
'names' attribute [5] must be the same length as the vector [4]
What vector is length [4]?
Is 'names' attribute my colour vector?
If I take one of the colours out of the custom palette, eg fiveColours <- c("#440154","#3B528B","#21908C","#5DC863") I get this error:
Error: Insufficient values in manual scale. 5 needed but only 4 provided.
Which implies the number of colours provided is correct but something else is causing the issue.
I've troubleshot to the limits of my own ability. Help please!
FYI:
> electives %>% select(Decade) %>% group_by(Decade) %>% summarise(n())
# A tibble: 5 x 2
Decade `n()`
<fct> <int>
1 50 14
2 60 173
3 70 442
4 80 168
5 90 2
Should prove the length of the Decade variable and here is how the survival object and survfit were generated:
> elective5Surv <- Surv(electives$surv5Y, electives$dead5Y)
> elective_5Decadesurv <- survfit(elective5Surv ~ electives$Decade)

Ok, I have sorted my own mistake by proof-reading!
Of the five hex colours I’d provided, two were identical (not on purpose.)
I changed the fifth colour to a different hex value (what it was meant to be in the first place) and it works now.
Thanks, Rui, for your response earlier, it helped me down the path!

Plot values are not being generated by my code using R studio, plotly in R, and 'parcoords'

I'm having trouble with plotly in R and 'parcoords'. I'm trying to plot using colorscale defined by Persona. Persona has values of 1 through 4 and I expect each number to have it's own color. The plot scales fine but there are no lines representing the values for each variable.
Here is the code
options(viewer=NULL)
p <- df %>%
plot_ly(type = 'parcoords',
line = list(color = ~Persona,
colorscale = list(c(0,'red'),c(0.5,'green'),c(1,'blue'),c(1.5,'yellow'))) ,
dimensions = list(
list(range = c(15,55),
label = 'Rescuer Count', values = ~RescuerCount),
list(range = c(15,50),
label = 'Rescuer Share', values = ~RescuerShare),
list(range = c(5,95),
label = 'Avg Serviced Zip Codes', values = ~AvgServZips),
list(range = c(10,925),
label = 'Avg Number of Rescues', values = ~ AAvgNumofRescues),
list(range = c(310,16000),
label = 'Avg Rescuer Earnings', values = ~ AAEarnings),
list(range = c(1,55),
label = 'Persona Share of Earnings', values = ~ EarnShare),
list(range = c(30,95),
label = ' Login Percentage', values = ~ LoginPrct),
list(range = c(7,95),
label = 'Prct of Login Days W/Offer', values = ~ PrctLoginDaysWO),
list(range = c(1,5),
label = 'Avg Acceptance Rate', values = ~ AvgAcceptRate),
list(range = c(150,1975),
label = 'Annualized Number of Offers', values = ~ ANumofOffers)
)
)
print(p)
Data Table is here
Persona RescuerCount RescuerShare AvgServZips AAvgNumofRescues AAEarnings EarnShare LoginPrct PrctLoginDaysWO AvgAcceptRate ANumofOffers
1 16 15 45 389 6706 27 71 91 30 1314
2 13 15 90 915 15805 51 91 94 47 1954
3 30 27 28 147 2429 18 55 86 22 679
4 51 46 6 20 319 4 34 75 13 152
resulting plot
Please Help

plot an item map (based on difficulties)

I have a data set of item difficulties that correspond to items on a questionnaire that looks like this:
## item difficulty
## 1 ITEM_01_A 2.31179818
## 2 ITEM_02_B 1.95215238
## 3 ITEM_03_C 1.93479536
## 4 ITEM_04_D 1.62610855
## 5 ITEM_05_E 1.62188759
## 6 ITEM_06_F 1.45137544
## 7 ITEM_07_G 0.94255210
## 8 ITEM_08_H 0.89941812
## 9 ITEM_09_I 0.72752197
## 10 ITEM_10_J 0.61792597
## 11 ITEM_11_K 0.61288399
## 12 ITEM_12_L 0.39947791
## 13 ITEM_13_M 0.32209970
## 14 ITEM_14_N 0.31707701
## 15 ITEM_15_O 0.20902108
## 16 ITEM_16_P 0.19923607
## 17 ITEM_17_Q 0.06023317
## 18 ITEM_18_R -0.31155481
## 19 ITEM_19_S -0.67777282
## 20 ITEM_20_T -1.15013758
I want to make an item map of these items that looks similar (not exactly) to this (I created this in word but it lacks true scaling as I just eyeballed the scale). It's not really a traditional statistical graphic and so I don't really know how to approach this. I don't care what graphics system this is done in but I am more familiar with ggplot2 and base.
I would greatly appreciate a method of plotting this sort of unusual plot.
Here's the data set (I'm including it as I was having difficulty using read.table on the dataframe above):
DF <- structure(list(item = c("ITEM_01_A", "ITEM_02_B", "ITEM_03_C",
"ITEM_04_D", "ITEM_05_E", "ITEM_06_F", "ITEM_07_G", "ITEM_08_H",
"ITEM_09_I", "ITEM_10_J", "ITEM_11_K", "ITEM_12_L", "ITEM_13_M",
"ITEM_14_N", "ITEM_15_O", "ITEM_16_P", "ITEM_17_Q", "ITEM_18_R",
"ITEM_19_S", "ITEM_20_T"), difficulty = c(2.31179818110545, 1.95215237740899,
1.93479536058926, 1.62610855327073, 1.62188759115818, 1.45137543733965,
0.942552101641177, 0.899418119889782, 0.7275219669431, 0.617925967008653,
0.612883990709181, 0.399477905189577, 0.322099696946661, 0.31707700560997,
0.209021078266059, 0.199236065264793, 0.0602331732900628, -0.311554806052955,
-0.677772822413495, -1.15013757942119)), .Names = c("item", "difficulty"
), row.names = c(NA, -20L), class = "data.frame")
Thank you in advance.

Here is a quick example:
ggplot(DF, aes(x=1, y=difficulty, label = item)) +
geom_text(size = 3) +
scale_y_continuous(breaks = DF$difficulty, minor_breaks = NULL, labels = sprintf("%.02f", DF$difficulty)) +
scale_x_continuous(breaks = NULL) +
opts(panel.grid.major = theme_blank())
but sometimes two items are too narrow so overlapped. You may do like this:
m <- 0.1
nd <- diff(rev(DF$difficulty))
nd <- c(0, cumsum(ifelse(nd < m, m, nd)))
DF$nd <- rev(rev(DF$difficulty)[1] + nd)
ggplot(DF, aes(x=1, y=nd, label = item)) +
geom_text(size = 3) +
scale_y_continuous(breaks = DF$nd, labels = sprintf("%.02f", DF$difficulty), DF$difficulty, minor_breaks = NULL) +
scale_x_continuous(breaks = NULL) +
opts(panel.grid.major = theme_blank())

Here is a solution with base graphics.
# Compute the position of the labels to limit overlaps:
# move them as little as possible, but keep them
# at least .1 units apart.
library(quadprog)
spread <- function(b, eps=.1) {
stopifnot(b == sort(b))
n <- length(b)
Dmat <- diag(n)
dvec <- b
Amat <- matrix(0,nr=n,nc=n-1)
Amat[cbind(1:(n-1), 1:(n-1))] <- -1
Amat[cbind(2:n, 1:(n-1))] <- 1
bvec <- rep(eps,n-1)
r <- solve.QP(Dmat, dvec, Amat, bvec)
r$solution
}
DF <- DF[ order(DF$difficulty), ]
DF$position <- spread(DF$difficulty, .1)
ylim <- range(DF$difficulty)
plot( NA,
xlim = c(.5,2),
ylim = ylim + .1*c(-1,1)*diff(ylim),
axes=FALSE, xlab="", ylab=""
)
text(.9, DF$position, labels=round(DF$difficulty,3), adj=c(1,0))
text(1.1, DF$position, labels=DF$item, adj=c(0,0))
arrows(1,min(DF$position),1,max(DF$position),code=3)
text(1,min(DF$position),labels="Easier",adj=c(.5,2))
text(1,max(DF$position),labels="More difficult",adj=c(.5,-1))
text(.9, max(DF$position),labels="Difficulty",adj=c(1,-2))
text(1.1,max(DF$position),labels="Item", adj=c(0,-2))

My own attempt but I think I'm going to like Vincent's solution much better as it looks like my original specification.
DF <- DF[order(DF$difficulty), ]
par(mar=c(1, 1, 3, 0)+.4)
plot(rep(1:2, each=10), DF$difficulty, main = "Item Map ",
ylim = c(max(DF$difficulty)+1, min(DF$difficulty)-.2),
type = "n", xlab="", ylab="", axes=F, xaxs="i")
text(rep(1.55, 20), rev(DF$difficulty[c(T, F)]),
DF$item[c(F, T)], cex=.5, pos = 4)
text(rep(1, 20), rev(DF$difficulty[c(F, T)]),
DF$item[c(T, F)], cex=.5, pos = 4)
par(mar=c(0, 0, 0,0))
arrows(1.45, 2.45, 1.45, -1.29, .1, code=3)
text(rep(1.52, 20), DF$difficulty[c(T, F)],
rev(round(DF$difficulty, 2))[c(T, F)], cex=.5, pos = 2)
text(rep(1.44, 20), DF$difficulty[c(F, T)],
rev(round(DF$difficulty, 2))[c(F, T)], cex=.5, pos = 2)
text(1.455, .5, "DIFFICULTY", cex=1, srt = -90)
text(1.45, -1.375, "More Difficult", cex=.6)
text(1.45, 2.5, "Easier", cex=.6)
par(mar=c(0, 0, 0,0))