Facets and multiple datasets in ggplot2 - r

I need to display two datasets on the same faceted plots with ggplot2. The first dataset (dat) is to be shown as crosses like this:
While the second dataset (dat2) is to be shown as a color line. For an element of context, the second dataset is actually the Pareto frontier of the first set...
Both datasets (dat and dat2) look like this:
modu mnc eff
1 0.3080473 0 0.4420544
2 0.3110355 4 0.4633741
3 0.3334024 9 0.4653061
Here's my code so far:
library(ggplot2)
dat <- structure(list(modu = c(0.30947265625, 0.3094921875, 0.32958984375,
0.33974609375, 0.33767578125, 0.3243359375, 0.33513671875, 0.3076171875,
0.3203125, 0.3205078125, 0.3220703125, 0.28994140625, 0.31181640625,
0.352421875, 0.31978515625, 0.29642578125, 0.34982421875, 0.3289453125,
0.30802734375, 0.31185546875, 0.3472265625, 0.303828125, 0.32279296875,
0.3165234375, 0.311328125, 0.33640625, 0.3140234375, 0.33515625,
0.34314453125, 0.33869140625), mnc = c(15, 9, 6, 0, 10, 12, 14,
9, 5, 11, 0, 15, 0, 2, 14, 13, 14, 17, 11, 12, 13, 6, 4, 0, 13,
7, 10, 12, 7, 13), eff = c(0.492448979591836, 0.49687074829932,
0.49421768707483, 0.478571428571428, 0.493537414965986, 0.493809523809524,
0.49891156462585, 0.499319727891156, 0.495102040816327, 0.492285714285714,
0.482312925170068, 0.498911564625851, 0.479931972789116, 0.492857142857143,
0.495238095238095, 0.49891156462585, 0.49530612244898, 0.495850340136055,
0.50156462585034, 0.496, 0.492897959183673, 0.487959183673469,
0.495605442176871, 0.47795918367347, 0.501360544217687, 0.497850340136054,
0.493496598639456, 0.493741496598639, 0.496734693877551, 0.499659863945578
)), .Names = c("modu", "mnc", "eff"), row.names = c(NA, 30L), class = "data.frame")
dat2 <- structure(list(modu = c(0.26541015625, 0.282734375, 0.28541015625,
0.29216796875, 0.293671875), mnc = c(0.16, 0.28, 0.28, 0.28,
0.28), eff = c(0.503877551020408, 0.504149659863946, 0.504625850340136,
0.505714285714286, 0.508503401360544)), .Names = c("modu", "mnc",
"eff"), row.names = c(NA, 5L), class = "data.frame")
dat$modu = dat$modu
dat$mnc = dat$mnc*50
dat$eff = dat$eff
dat2$modu = dat2$modu
dat2$mnc = dat2$mnc*50
dat2$eff = dat2$eff
# Build a long data frame for pairwise faceting: for each of the three column
# pairs from combn(1:3, 2), the pair's columns are moved to the front and named
# "x" and "y", and `var` labels the pair as "<first>/<second>". The per-pair
# pieces are then stacked with rbind.
res <- do.call(rbind, combn(1:3, 2, function(ii)
cbind(setNames(dat[,c(ii, setdiff(1:3, ii))], c("x", "y")),
var=paste(names(dat)[ii], collapse="/")), simplify=F))
# Draw every pair as crosses (shape 4), one facet per pair, each facet with
# its own axis ranges.
ggplot(res, aes(x=x, y=y))+ geom_point(shape=4) +
facet_wrap(~ var, scales="free")
How should I go about doing this? Do I need to add a layer? If so, how to do this in a faceted plot?
Thanks!

Here's one way:
# Reshape a three-column data frame into long form for pairwise faceting.
# For each column pair from combn(1:3, 2) the pair's columns are moved to the
# front and named "x" and "y"; `var` labels the pair as "<first>/<second>".
# The per-pair pieces are stacked row-wise. Used for both datasets so the
# points and the line are guaranteed to be reshaped the same way.
pair_long <- function(d) {
  pieces <- combn(1:3, 2, function(ii) {
    cbind(
      setNames(d[, c(ii, setdiff(1:3, ii))], c("x", "y")),
      var = paste(names(d)[ii], collapse = "/")
    )
  }, simplify = FALSE)
  do.call(rbind, pieces)
}

pts <- pair_long(dat)   # data drawn as crosses
lns <- pair_long(dat2)  # data drawn as a line

# Tag each row with the geometry it belongs to and stack both sets.
gg.df <- rbind(cbind(geom = "pt", pts), cbind(geom = "ln", lns))

ggplot(gg.df, aes(x, y)) +
  geom_point(data = gg.df[gg.df$geom == "pt", ], shape = 4) +
  # geom_path connects rows in data order (geom_line would sort by x first)
  geom_path(data = gg.df[gg.df$geom == "ln", ], color = "red") +
  facet_wrap(~var, scales = "free")
The basic idea is to create separate data.frames for the points and the lines, then bind them together row-wise with an extra column (geom) indicating which geometry the data goes with. Then we plot the points based on the subset of gg.df with geom=="pt" and similarly with the lines.
The result isn't very interesting with your limited example, but this seems (??) to be what you want. Notice the use of geom_path(...) rather than geom_line(...). The latter orders the x-values before plotting.

Related

How do I chart a new graph for multiple columns where there are 2 series for Y

I have a dataset df1. I would like to chart a line graph with Date as X and Chart1 and T1. I would like to loop it for all column where I would get the same graph when Y is Chart2 and T2 as well as when Y is Chart3 and T3.
dput(df1) >
structure(list(Date = c(1990, 1991, 1992, 1993, 1994), Chart1 = c(25,
34, 19, 7, 4), T1 = c(23.5, 23.5, 23.5, 23.5, 23.5), Chart2 = c(2,
4, 12, 9, 15), T2 = c(10, 10, 10, 10, 10), Chart3 = c(11, 9,
8, 6, 2), T3 = c(5, 5, 5, 5, 5)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
This is the chart that I would like to get for each column (Chart2,T2) and (Chart3,T3) etc.
ggplot(df1, aes(x=Date)) + geom_line(aes(y=Chart1)) + geom_line(aes(y=T1), color="red")
You may separate out 'Chart' variables and 'T' variables and create list of plots.
library(ggplot2)
# Names of the measured series (containing "Chart") and of the reference
# series ("T" followed by digits), in the order they appear in df1.
col_names <- names(df1)
chart_cols <- col_names[grepl('Chart', col_names)]
t_cols <- col_names[grepl('T\\d+', col_names)]

# Plot one chart column and one T column (in red) against Date.
plot_pair <- function(x, y) {
  ggplot(df1, aes(x=Date)) +
    geom_line(aes(y=.data[[x]])) +
    geom_line(aes(y=.data[[y]]), color="red")
}

# One plot per (Chart, T) pair, paired by position.
list_plots <- Map(plot_pair, chart_cols, t_cols)

R data.table keys and column names. Harmonisation

I am trying to set keys to a data.table and keep the original column names on the second row. All that I have tried so far changes the column names to keys and erases the original variables. I have ten data.tables to merge and all the variables have different names like in the example. So I made keys but would like to keep the originals as well before harmonisation just to be sure.
library(tidyverse)
library(lubridate)
library(forcats)
library(stringr)
library(data.table)
library(rio)
library(dplyr)
1. Keys
keys1 <- c("SDC_GENDER","SDC_CHILD_NB","LAB_CRP","PM_HIP")
keys2 <- c("SDC_GENDER","SDC_CHILD_NB","LAB_CRP","PM_HIP")
2. data.table example with variable names.
TD3 = data.table(q128 = c(1, 2, 1, 2), q129 = c(1, 5, 2, 4), q130 = c(0.8, 3.0, 10.0, NA), q131 = c(55, 56, 80, 79))
TD3
TD4 = data.table(q128 = c(1, 1, 1, 2), q129 = c(1, 3, 2, 999), q130 = c(0.9, 3.1, NA, 9.0), q131 = c(58, 60, 45, NA))
TD4
I'm not sure this is really the data structure you want to have, that is to have mixed variable types like r2evans said. However...this solution works. Just put all your little data.tables into a list and voila.
I noticed that keys1 and keys2 are identical, so I just used one of them. If they should be different keys for each they can also be listed.
keys1 <- c("SDC_GENDER","SDC_CHILD_NB","LAB_CRP","PM_HIP")
TD <- list(
  data.table(q128 = c(1, 2, 1, 2), q129 = c(1, 5, 2, 4), q130 = c(0.8, 3.0, 10.0, NA), q131 = c(55, 56, 80, 79)),
  data.table(q128 = c(1, 1, 1, 2), q129 = c(1, 3, 2, 999), q130 = c(0.9, 3.1, NA, 9.0), q131 = c(58, 60, 45, NA))
)
# Rename every table's columns to the harmonised keys while keeping the
# original column names as the first row of the table.
TD <- lapply(TD, FUN = function(x) {
  oldcolumns <- colnames(x)
  # Fail early if a table does not have exactly one column per key.
  stopifnot(length(oldcolumns) == length(keys1))
  # One-row table holding the original names, built from the name vector
  # itself so it works for any number of columns.
  td <- as.data.table(as.list(oldcolumns))
  colnames(td) <- keys1
  colnames(x) <- keys1
  # NOTE(review): the name row is character, so binding it onto numeric
  # columns is expected to turn those columns into character - confirm this
  # is acceptable before merging.
  rbind(td, x)
})

How to automate positioning of inner labels within a stacked barplot?

I frequently have to produce stacked bar plots with labels. The way I've been coding the labels is very time intensive and I wondered if there was a way to code things more efficiently. I would like the labels to be centered on each section of the bars. I'd prefer base R solutions.
stemdata <- structure(list( #had to round some nums below for 100% bar
A = c(7, 17, 76),
B = c(14, 10, 76),
C = c( 14, 17, 69),
D = c( 4, 10, 86),
E = c( 7, 17, 76),
F = c(4, 10, 86)),
.Names = c("Food, travel, accommodations, and procedures",
"Travel itinerary and dates",
"Location of the STEM Tour stops",
"Interactions with presenters/guides",
"Duration of each STEM Tour stop",
"Overall quality of the STEM Tour"
),
class = "data.frame",
row.names = c(NA, -3L)) #4L=number of numbers in each letter vector#
# attach(stemdata)
print(stemdata)
par(mar=c(0, 19, 1, 2.1)) # this sets margins to allow long labels
barplot(as.matrix(stemdata),
beside = F, ylim = range(0, 10), xlim = range(0, 100),
horiz = T, col=colors, main="N=29",
border=F, las=1, xaxt='n', width = 1.03)
text(7, 2, "14%")
text(19, 2, "10%")
text(62, 2, "76%")
text(7, 3.2, "14%")
text(22.5, 3.2, "17%")
text(65.5, 3.2, "69%")
text(8, 4.4, "10%")
text(55, 4.4, "86%")
text(3.5, 5.6, "7%")
text(15, 5.6, "17%")
text(62, 5.6, "76%")
text(9, 6.9, "10%")
text(55, 6.9, "86%")
Staying base R as OP requested, we can easily automate the inner label positioning (i.e. x coordinates) within a small function.
# Centre position of each segment in a stacked bar: half of the segment's own
# size plus the combined size of all segments stacked before it.
xFun <- function(x) {
  offsets <- c(0, head(cumsum(x), -1L))
  x / 2 + offsets
}
Now, it's good to know that barplot invisibly returns the y coordinates of the bars; we can catch them by assignment (here byc <- barplot(.)).
Eventually, just assemble coordinates and labels in data frame labs and "loop" through the text calls in a sapply. (Use col="white" or col=0 for white labels as wished in the other question.)
# barplot
colors <- c("gold", "orange", "red")
# Expand the margins for the long bar names. `par()` returns the previous
# settings, which are kept so they can be restored exactly at the end.
old_par <- par(mar=c(2, 19, 4, 2) + 0.1)
# `barplot()` invisibly returns the bar positions; assign them to `byc` to
# reuse them as the y coordinates of the labels.
byc <- barplot(as.matrix(stemdata), horiz=TRUE, col=colors, main="N=29",
               border=FALSE, las=1, xaxt='n')
# labels: one row per bar segment, taken column by column
labs <- data.frame(x=as.vector(sapply(stemdata, xFun)),        # apply `xFun` here
                   y=rep(byc, each=nrow(stemdata)),             # use `byc` here
                   labels=paste0(unlist(stemdata, use.names=FALSE), "%"),
                   stringsAsFactors=FALSE)
invisible(sapply(seq_len(nrow(labs)), function(x) # `invisible` prevents unneeded console output
  text(x=labs[x, 1:2], labels=labs[x, 3], cex=.9, font=2, col=0)))
# legend (set `xpd=TRUE` to plot beyond margins!)
legend(-55, 8.5, legend=c("Medium","High", "Very High"), col=colors, pch=15, xpd=TRUE)
par(old_par) # restore the settings that were active before this plot
Result
Data
stemdata <- structure(list(`Food, travel, accommodations, and procedures` = c(7,
17, 76), `Travel itinerary and dates` = c(14, 10, 76), `Location of the STEM Tour stops` = c(14,
17, 69), `Interactions with presenters/guides` = c(4, 10, 86),
`Duration of each STEM Tour stop` = c(7, 17, 76), `Overall quality of the STEM Tour` = c(4,
10, 86)), class = "data.frame", row.names = c(NA, -3L))
Would you consider a tidyverse solution?
library(tidyverse) # for dplyr, tidyr, tibble & ggplot2
# Stacked horizontal bars with a percentage label per segment.
stemdata %>%
# move the row names into an explicit `id` column (one id per segment level)
rownames_to_column(var = "id") %>%
# long format: `Var` holds the original column name, `Val` its value
gather(Var, Val, -id) %>%
group_by(Var) %>%
# fix the level order of `id` as 3, 2, 1; `id` drives the fill below
mutate(id = factor(id, levels = 3:1)) %>%
ggplot(aes(Var, Val)) +
geom_col(aes(fill = id)) +
# flip the axes so the bars run horizontally
coord_flip() +
# position_stack(0.5) stacks the labels with vjust = 0.5, intended to place
# each label in the middle of its segment
geom_text(aes(label = paste0(Val, "%")),
position = position_stack(0.5))
Result:

Leave one out cross validation by leaving out two ID during the training process

I have a dataframe df
df<-structure(list(ID = c(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
6, 6, 6, 6, 8, 8, 8, 9, 9), Y = c(2268.14043972082, 2147.62290922552,
2269.1387550775, 2247.31983098201, 1903.39138268307, 2174.78291538358,
2359.51909126411, 2488.39004804939, 212.851575751527, 461.398994384333,
567.150629704352, 781.775113821961, 918.303706148872, 1107.37695799186,
1160.80594193377, 1412.61328924168, 1689.48879626486, 685.154353165934,
574.088067465695, 650.30821636616, 494.185166497016, 436.312162090908
), P = c(1750.51986303926, 1614.11541634798, 951.847023338079,
1119.3682884872, 1112.38984390156, 1270.65773075982, 1234.72262170166,
1338.46096616983, 1198.95775346458, 1136.69287367165, 1265.46480803983,
1364.70149818063, 1112.37006707489, 1346.49240261316, 1740.56677791104,
1410.99217295647, 1693.18871380948, 275.447173420805, 396.449789014179,
251.609239829704, 215.432550271042, 55.5336257666349), A = c(49,
50, 51, 52, 53, 54, 55, 56, 1, 2, 3, 4, 5, 14, 15, 16, 17, 163,
164, 165, 153, 154), TA = c(9.10006221322572, 7.65505467142961,
8.21480062559674, 8.09251754304318, 8.466220758789, 8.48094407814006,
8.77304120569444, 8.31727518543397, 8.14410265791868, 8.80921738865237,
9.04091478341757, 9.66233618146246, 8.77015716015164, 9.46037931956657,
9.59702379240667, 10.1739258740118, 9.39524442215692, -0.00568604734662462,
-2.12940164413048, -0.428603434930109, 1.52337963973006, -1.04714984064565
), TS = c(9.6499861763085, 7.00622420539595, 7.73511170298675,
7.68006974050443, 8.07442411510912, 8.27687965909096, 8.76025039592727,
8.3345638889156, 9.23658956753677, 8.98160722605782, 8.98234210211611,
9.57066566368204, 8.74444401914267, 8.98719629775988, 9.18169205278566,
9.98225438314085, 9.56196773059615, 5.47788158053928, 2.58106090926808,
3.22420704848299, 1.36953555753786, 0.241334267522977), R = c(11.6679680423377,
11.0166459173372, 11.1851268491296, 10.7404563561694, 12.1054055597684,
10.9551321815546, 11.1975918244469, 10.7242192465965, 10.1661703705992,
11.4840412725324, 11.1248456370953, 11.2529612597628, 10.7694642397996,
12.3300887767583, 12.0478558531771, 12.3212362249214, 11.5650773932264,
9.56070414783612, 9.61762902218185, 10.2076240621201, 11.8234628013552,
10.9184029778985)), .Names = c("ID", "Y", "P", "A", "TA", "TS",
"R"), na.action = structure(77:78, .Names = c("77", "78"), class = "omit"), row.names = c(NA,
22L), class = "data.frame")
I am currently doing a linear regression in leave-one-out cross-validation mode. In other words, during training I remove one site in each iteration and test the model on the site left out. See the procedure below:
# Leave-one-ID-out cross-validation: each iteration fits on all rows except
# those of one ID and predicts the rows of that ID.
df$prediction <- NA
for(id in unique(df$ID)){
# training rows: every ID except the current one
train.df <- df[df$ID != id,]
# test rows: the current ID, predictor columns only
test.df <- df[df$ID == id, c("P", "A", "TA", "TS","R")]
lm.df<- glm(Y ~ P+A+TA+TS+R, data=train.df)
# backward stepwise selection starting from the full model
step.df<- step(lm.df, direction = "backward")
df.pred = predict(object = step.df, newdata = test.df)
# store the out-of-sample predictions next to the held-out rows
df$prediction[df$ID== id] <- df.pred
}
However, I would like to remove 2 IDs for each iteration during the cross validation instead of one. Therefore, my test set will contain two IDs instead of one every time. Anyone know how I could do it?
If you change == into %in% and unique(df$ID) into split(unique(df$ID), c(1,1,2,2,3)) it seems to be working. Essentially, in each iteration you pass two ids instead of one, so the test.df set contains those two.
See this:
# Cross-validation that holds out `ids_per_fold` IDs per iteration: each fold
# is fitted on all other IDs and predicts the rows of the held-out IDs.
df$prediction <- NA
ids_per_fold <- 2
ids <- unique(df$ID)
# Group consecutive IDs into folds. With five IDs and a fold size of 2 this is
# split(ids, c(1, 1, 2, 2, 3)), so the last fold holds the single leftover ID.
# Deriving the grouping from the number of IDs keeps it valid for other data.
folds <- split(ids, ceiling(seq_along(ids) / ids_per_fold))
for (id in folds) {
  print(id)
  held_out <- df$ID %in% id
  train.df <- df[!held_out, ]
  test.df <- df[held_out, c("P", "A", "TA", "TS", "R")]
  lm.df <- glm(Y ~ P + A + TA + TS + R, data = train.df)
  # trace = 0 silences the stepwise log so only the printed IDs appear
  step.df <- step(lm.df, direction = "backward", trace = 0)
  df$prediction[held_out] <- predict(object = step.df, newdata = test.df)
}
Output:
[1] 4 5
[1] 6 8
[1] 9
I have set trace to zero above so that it only prints the ids passed in the loop. As you can see you have two instead of one (apart from the last one obviously). split splits the vector unique(df$ID) in 2-element pieces which we can then use within the loop.

Show point colour according to their row position in table

I want to display a scatter plot of points from a csv table with ggplot2. The trick is that I'd like each point, or cross, to have a different colour according to their row number in the csv file (using RColorBrewer's spectral colours).
The dataset (dat) looks like this:
modu mnc eff
1 0.3080473 0 0.4420544
2 0.3110355 4 0.4633741
3 0.3334024 9 0.4653061
So I'd like row 1 to be very blue, row two to be a little less, row three to be kind of green, etc.
Here's my code so far:
library(ggplot2)
library(RColorBrewer)
dat <- structure(list(modu = c(0.30947265625, 0.3094921875, 0.32958984375,
0.33974609375, 0.33767578125, 0.3243359375, 0.33513671875, 0.3076171875,
0.3203125, 0.3205078125, 0.3220703125, 0.28994140625, 0.31181640625,
0.352421875, 0.31978515625, 0.29642578125, 0.34982421875, 0.3289453125,
0.30802734375, 0.31185546875, 0.3472265625, 0.303828125, 0.32279296875,
0.3165234375, 0.311328125, 0.33640625, 0.3140234375, 0.33515625,
0.34314453125, 0.33869140625), mnc = c(15, 9, 6, 0, 10, 12, 14,
9, 5, 11, 0, 15, 0, 2, 14, 13, 14, 17, 11, 12, 13, 6, 4, 0, 13,
7, 10, 12, 7, 13), eff = c(0.492448979591836, 0.49687074829932,
0.49421768707483, 0.478571428571428, 0.493537414965986, 0.493809523809524,
0.49891156462585, 0.499319727891156, 0.495102040816327, 0.492285714285714,
0.482312925170068, 0.498911564625851, 0.479931972789116, 0.492857142857143,
0.495238095238095, 0.49891156462585, 0.49530612244898, 0.495850340136055,
0.50156462585034, 0.496, 0.492897959183673, 0.487959183673469,
0.495605442176871, 0.47795918367347, 0.501360544217687, 0.497850340136054,
0.493496598639456, 0.493741496598639, 0.496734693877551, 0.499659863945578
)), .Names = c("modu", "mnc", "eff"), row.names = c(NA, 30L), class = "data.frame")
dat2 <- structure(list(modu = c(0.26541015625, 0.282734375, 0.28541015625,
0.29216796875, 0.293671875), mnc = c(0.16, 0.28, 0.28, 0.28,
0.28), eff = c(0.503877551020408, 0.504149659863946, 0.504625850340136,
0.505714285714286, 0.508503401360544)), .Names = c("modu", "mnc",
"eff"), row.names = c(NA, 5L), class = "data.frame")
dat$modu = dat$modu
dat$mnc = dat$mnc*50
dat$eff = dat$eff
dat2$modu = dat2$modu
dat2$mnc = dat2$mnc*50
dat2$eff = dat2$eff
# Build a long data frame for pairwise faceting: for each of the three column
# pairs from combn(1:3, 2), the pair's columns are moved to the front and named
# "x" and "y", and `var` labels the pair as "<first>/<second>". The per-pair
# pieces are then stacked with rbind.
res <- do.call(rbind, combn(1:3, 2, function(ii)
cbind(setNames(dat[,c(ii, setdiff(1:3, ii))], c("x", "y")),
var=paste(names(dat)[ii], collapse="/")), simplify=F))
# Draw every pair as crosses (shape 4), one facet per pair, each facet with
# its own axis ranges.
ggplot(res, aes(x=x, y=y))+ geom_point(shape=4) +
facet_wrap(~ var, scales="free")
How should I go about doing this?
Thanks!
# Same pairwise long format as in the question, plus a `row` column holding
# each point's row number in `dat`, so the colour can follow the row position.
res <- do.call(rbind, combn(1:3, 2, function(ii) {
  cbind(
    row = seq_len(nrow(dat)),  # seq_len stays empty when dat has no rows
    setNames(dat[, c(ii, setdiff(1:3, ii))], c("x", "y")),
    var = paste(names(dat)[ii], collapse = "/")
  )
}, simplify = FALSE))
ggplot(res, aes(x = x, y = y, color = row)) +
  geom_point(shape = 4) +
  # reversed Spectral palette, so the first rows take the palette's last colours
  scale_color_gradientn(colours = rev(brewer.pal(10, "Spectral"))) +
  facet_wrap(~ var, scales = "free")

Resources