Combine scatterplot and barplot, then lapply - r

I am trying to add a scatterplot and a barplot within the same plot area with ggplot. The scatterplot should be averages of var. '1' over var.'2' for one dataset, and the barplot should be the average value of '1' over my control dataset.
My data looks like this:
> dput(lapply(ubbs6, head))
list(structure(c(96L, 96L, 100L, 88L, 93L, 100L, 61L, 61L, 70L,
40L, 58L, 70L, 7807L, 7357L, 7695L, 6400L, 6009L, 7735L), .Dim = c(6L,
3L), .Dimnames = list(NULL, c("1", "2", "3"))), structure(c(99L,
96L, 100L, 96L, 96L, 96L, 66L, 67L, 70L, 63L, 57L, 62L, 7178L,
6028L, 6124L, 6082L, 6873L, 5629L, 31L, 27L, 60L, 42L, 12L, 18L
), .Dim = c(6L, 4L), .Dimnames = list(NULL, c("1", "2",
"3", "4"))), structure(c(99L, 95L, 95L, 100L, 96L, 95L, 69L,
58L, 56L, 70L, 61L, 65L, 6067L, 6331L, 6247L, 5988L, 7538L, 6162L,
50L, 36L, 67L, 10L, 55L, 70L), .Dim = c(6L, 4L), .Dimnames = list(
NULL, c("1", "2", "3", "4"))))
Example of what I've tried so far:
aggregate(ubbs6[[2]][,'1'], list(ubbs6[[2]][,'2']), mean)
m162 <- aggregate(ubbs6[[2]][,'1'], list(ubbs6[[2]][,'2']), mean)
m163 <- aggregate(ubbs6[[3]][,'1'], list(ubbs6[[3]][,'2']), mean)
m161 <- mean(ubbs6[[1]][,'1'])
ggplot(m162, aes_(x = m162[,'Group.1'], y = m162[,'x']))+
geom_point()+
geom_smooth(method = 'lm', formula = 'y ~ sqrt (x)')
I would like to do two things:
add a barplot of one x,y value of my control set (ubbs6[[1]])
throw this into a lapply structure so I can do this for 11 similar datasets
Any help would be greatly appreciated!
**EDIT: edited out specific details that aren't needed for others to understand the code **

Saving your data in d, you can try
ggplot(as.data.frame(d[[2]]),aes(age, FPAR) ) +
coord_cartesian(ylim = c(90,100)) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y ~ sqrt (x)') +
geom_col(data=data.frame(x=max(as.data.frame(d[[2]])$age),
y=mean(as.data.frame(d[[1]])$FPAR)),
aes(x,y), inherit.aes = FALSE)
You have to use coord_cartesian to specify the y-limits and inherit.aes = FALSE. Otherwise the bar is not correctly drawn.
When you have to combine your second and third dataframe in one plot, you can try
library(tidyverse)
d %>%
.[2:3] %>%
map(as.data.frame) %>%
bind_rows(.id = "id") %>%
mutate(max = max(age),
Mean = mean(d[[1]][1])) %>%
ggplot(aes(age, FPAR, color=id)) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y ~ sqrt (x)', se=FALSE) +
geom_col(data = . %>% distinct(max, Mean),
aes(max, Mean), inherit.aes = FALSE)

Related

Plot multiple regression lines on one plot in ggplot2

Sorry if this is a repeat question but I haven't managed to find an answer yet since my data frame has to be split. I am trying to plot two regression lines on one plot, with a regression line for data in period 1 (1815-1899)and a regression line for data in period 2 (1900-2013). I have used dplyr to split the data to run the two separate regressions but can't work out how to get them on the same graph as you seem to need the data frame in the ggplot() command for it to plot the line. Can anyone help?
Thanks.
library(tidyverse)
brest<-read.csv("brest.csv",header=TRUE) ## read in csv
brest<- na.omit(brest) ## get rid of NAs
brestp1<- select(filter(brest, period == 1),c(year,slr,period)) ## Divide into periods
brestp2<- select(filter(brest, period == 2),c(year,slr,period))
fit1 <- lm(slr ~ year, data = brestp1) ## Run lms
summary(fit1)
fit2<- lm(slr ~ year, data = brestp2)
summary(fit2)
## plot graph
ggplot(brestp1, aes(x = year, y = slr)) + ### Need not only brestp1 but also brestp2
geom_point() +
stat_smooth(method = "lm",se=FALSE)+
theme_classic()
## Data
## Brest period 1
structure(list(year = 1815:1820, slr = c(6926L, 6959L, 6945L,
6965L, 6941L, 6909L), period = c(1L, 1L, 1L, 1L, 1L, 1L)), na.action = structure(c(`30` = 30L,
`31` = 31L, `32` = 32L, `33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L,
`37` = 37L, `38` = 38L, `39` = 39L, `51` = 51L, `52` = 52L, `53` = 53L,
`54` = 54L, `138` = 138L, `139` = 139L, `140` = 140L, `141` = 141L,
`142` = 142L, `143` = 143L, `144` = 144L, `145` = 145L, `146` = 146L
), class = "omit"), row.names = c(NA, 6L), class = "data.frame")
##Brest period 2
structure(list(year = 1900:1905, slr = c(6936L, 6916L, 6923L,
6976L, 6931L, 6913L), period = c(2L, 2L, 2L, 2L, 2L, 2L)), na.action = structure(c(`30` = 30L,
`31` = 31L, `32` = 32L, `33` = 33L, `34` = 34L, `35` = 35L, `36` = 36L,
`37` = 37L, `38` = 38L, `39` = 39L, `51` = 51L, `52` = 52L, `53` = 53L,
`54` = 54L, `138` = 138L, `139` = 139L, `140` = 140L, `141` = 141L,
`142` = 142L, `143` = 143L, `144` = 144L, `145` = 145L, `146` = 146L
), class = "omit"), row.names = c(NA, 6L), class = "data.frame")
Use geom_smooth with separate data:
ggplot() +
geom_smooth(aes(x = year, y = slr), data = brest1,
method = "lm", se = FALSE, color = "red") +
geom_smooth(aes(x = year, y = slr), data = brest2,
method = "lm", se = FALSE, color = "blue") +
geom_point(aes(x = year, y = slr), data = brest1, color = "red") +
geom_point(aes(x = year, y = slr), data = brest2, color = "blue")

Boxplot labelling outliers returns an error using data rownames

I am trying to label the outliers in my boxplot using the text function so I can find out from which class the outliers are coming from. I've stored the rownames of my data in variable "rownames" using names(vehData) to get the row names. When I apply this however, I get an error.
ERROR: Error in which(removeOutliers1 == bxpdat$out, arr.ind = TRUE) :
'list' object cannot be coerced to type 'double'
Completely new to R programming. Completely not sure how to fix this or what I am doing wrong
Thanks in advance for any help!
library(reshape2)
vehData <-
structure(
list(
Samples = 1:6,
Comp = c(95L, 91L, 104L, 93L, 85L,
107L),
Circ = c(48L, 41L, 50L, 41L, 44L, 57L),
D.Circ = c(83L,
84L, 106L, 82L, 70L, 106L),
Rad.Ra = c(178L, 141L, 209L, 159L,
205L, 172L),
Pr.Axis.Ra = c(72L, 57L, 66L, 63L, 103L, 50L),
Max.L.Ra = c(10L,
9L, 10L, 9L, 52L, 6L),
Scat.Ra = c(162L, 149L, 207L, 144L, 149L,
255L),
Elong = c(42L, 45L, 32L, 46L, 45L, 26L),
Pr.Axis.Rect = c(20L,
19L, 23L, 19L, 19L, 28L),
Max.L.Rect = c(159L, 143L, 158L, 143L,
144L, 169L),
Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
Ra.Gyr = c(184L,
158L, 220L, 127L, 188L, 264L),
Skew.Maxis = c(70L, 72L, 73L,
63L, 127L, 85L),
Skew.maxis = c(6L, 9L, 14L, 6L, 9L, 5L),
Kurt.maxis = c(16L,
14L, 9L, 10L, 11L, 9L),
Kurt.Maxis = c(187L, 189L, 188L, 199L,
180L, 181L),
Holl.Ra = c(197L, 199L, 196L, 207L, 183L, 183L),
Class = c("van", "van", "saab", "van", "bus", "bus")
),
row.names = c(NA,
6L), class = "data.frame")
#Remove outliers
removeOutliers <- function(data) {
OutVals <- boxplot(data)$out
remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
return (remOutliers)
}
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
vehClass <- vehData$Class
rownames <- names(vehData) #column names
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
bxpdat <- boxplot(removeOutliers1)
#Also tried using vehicles$Class instead of rownames but get the same error
text(bxpdat$group, bxpdat$out,
rownames[which(removeOutliers1 == bxpdat$out, arr.ind = TRUE)[,1]],
pos = 4)
The boxplot looks like this. I am trying to label the outliers based on the x axis e.g. "Comp", "Circ", "D.Circ", "Rad.Ra", "Max.L.Ra" etc.. & by vehicle class "Van", "Bus" ..
Crammed text issue when identifying class
If it is the outliers in the 2nd boxplot, it would be:
bxpdat <- boxplot(removeOutliers1)
text(bxpdat$group, bxpdat$out,
bxpdat$names[bxpdat$group],
pos = 4)
Maybe looks better like this, if you adjust the margin and flip the labels:
par(mar=c(8,3.5,3.5,3.5))
bxpdat = boxplot(removeOutliers1,las=2,cex=0.5)
text(bxpdat$group, bxpdat$out,
bxpdat$names[bxpdat$group],
pos = 4,cex=0.5)
I understood the question differently to #StupidWolf. I thought the goal was to replace points indicating outliers with the text of the vehicle class (bus, van or saab). If you simply print the variable name (e.g. Skew.maxis), then you might as well have simply plotted the outliers as points. Unless I'm missing something.
Here is code to answer the question as I understood it, for what it's worth (beginning after defining removeOutliers):
# CHANGE: Create vehClass vector before removing Class from the dataframe
vehClass <- vehData$Class
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
bxpdat <- boxplot(removeOutliers1) # use boxplot(vehData) if you plot all the outliers as points
# loop over columns
n_plot <- 1; set.seed(123) # only plot n_plot randomly-chosen outliers
for(i in 1:ncol(vehData)){
# find out which row indices were removed as outliers
diffInd <- which(vehData[[i]] %in% setdiff(vehData[[i]], removeOutliers1[[i]]))
# if none were, then don't add any outlier text
if(length(diffInd) == 0) next
print(i)
print(paste0("l:", length(diffInd)))
if(length(diffInd) > n_plot){
diffIndPlot <- sample(diffInd, n_plot, replace = FALSE)
} else diffIndPlot <- diffInd
text(x = i, y = vehData[[i]][diffIndPlot],
labels = paste0(vehClass[diffIndPlot], ": ", vehData[[i]][diffIndPlot]))
}

How could I insert a histogram into a `geom_smooth` plot?

I am trying to mimic some figures from journal papers. Here is an example from Schlenker and Roberts (2009).
I'd like to add a similar histogram to my own plot. Please see below. Is it possible to achieve this task with ggplot? Thanks.
See a dput data below. rh represents x axis and yhat1 indicates the y axis.
> dput(df.m[,c('rh','yhat1')])
structure(list(rh = c(11L, 13L, 15L, 16L, 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L,
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L,
60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L,
73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L,
86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L,
99L, 100L), yhat1 = c(0.0097784, 0.111762325, 0.0887123966666667,
0.24714677, 0.079887235, 0.162714825, 0.24789043, 0.107558165,
0.182885584545455, 0.136690964444444, 0.159203683333333, 0.5156053805,
0.587034213636364, 0.233377613, 0.31531245, 0.4778449572, 0.212574774137931,
0.2274105676, 0.253733041707317, 0.560999839354839, 0.224892959444444,
0.392268151304348, 0.351498776603774, 0.366547010727273, 0.35013903469697,
0.382026272372881, 0.510611202461538, 0.391176294871795, 0.423356474328358,
0.380316089137931, 0.459821489651163, 0.388949226593407, 0.506833284166667,
0.459263999259259, 0.558535709906542, 0.745323656071429, 0.60167464606383,
0.72210854266129, 0.695203745656566, 0.638265557105263, 0.52373110503876,
0.611695133046875, 0.963833986386555, 0.803060819275362, 0.837984669112426,
0.7931166204, 0.870764136976744, 1.21005393820225, 0.862845527777778,
1.028402381125, 1.2077895633526, 1.01176334204082, 1.08139833964706,
0.90346288, 1.05871937863014, 1.27788244930233, 1.16250975336634,
1.1450916525, 1.4412301412, 1.21264826238281, 1.35417930411504,
1.18588206727273, 1.40277204710084, 1.33194569259259, 1.18413544210084,
1.22718163528571, 1.33992107226667, 1.44770425268156, 1.43974964777778,
1.26656031551351, 1.58998655363636, 1.29994566024272, 1.46398530493902,
1.26061274530055, 1.30718501225275, 1.20523443567901, 1.23789593428571,
1.34433582230769, 1.36438752851852, 1.5915544857037, 1.10979387898438,
1.31898147708661, 1.426120105, 1.52075980155738, 1.40629729460177,
0.9048366681, 1.2973945580531, 1.37696154192982)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -88L))
Hopefully this can get you started:
library(ggplot2)
breaks <- 20
maxcount <- max(table(cut(df.m$rh,breaks = 20))) + 1
ggplot(data = df.m, aes(x = rh)) +
stat_smooth(formula = y ~ x, aes(y = yhat1 * 10 + maxcount), method = "loess") +
scale_y_continuous(breaks = c(0,5), "Exposure (Days)",
sec.axis = sec_axis(~ (. - maxcount) /10,
"Log of Daily Confirmed Case Counts")) +
geom_histogram(bins = breaks, color = "black", fill = "green") +
geom_segment(aes(x = 85, xend = 85, y = 0 + maxcount, yend = Inf),
col = "red", linetype = "dashed") +
labs(x = "Relative Humidity Percentage") + theme_classic() +
theme(axis.line.y.left = element_line(color= "green"),
axis.title.y.left = element_text(hjust = 0.05, color = "green"))

Non-linear regression vs log model

library(ggplot2)
dat <- structure(list(y = c(52L, 63L, 59L, 58L, 57L, 54L, 27L, 20L, 15L, 27L, 27L, 26L, 70L, 70L, 70L, 70L, 70L, 70L, 45L, 42L, 41L, 55L, 45L, 39L, 51L,
64L, 57L, 39L, 59L, 37L, 44L, 44L, 38L, 57L, 50L, 56L, 66L, 66L, 64L, 64L, 60L, 55L, 52L, 57L, 47L, 57L, 64L, 63L, 49L, 49L,
56L, 55L, 57L, 42L, 60L, 53L, 53L, 57L, 56L, 54L, 42L, 45L, 34L, 52L, 57L, 50L, 60L, 59L, 52L, 42L, 45L, 47L, 45L, 51L, 39L,
38L, 42L, 33L, 62L, 57L, 65L, 44L, 44L, 39L, 46L, 49L, 52L, 44L, 43L, 38L),
x = c(122743L, 132300L, 146144L, 179886L, 195180L, 233605L, 1400L, 1400L, 3600L, 5000L, 14900L, 16000L, 71410L, 85450L, 106018L,
119686L, 189746L, 243171L, 536545L, 719356L, 830031L, 564546L, 677540L, 761225L, 551561L, 626799L, 68618L, 1211267L, 1276369L,
1440113L, 1153720L, 1244575L, 1328641L, 610452L, 692624L, 791953L, 4762522L, 5011232L, 5240402L, 521339L,
560098L, 608641L, 4727833L, 4990042L, 5263899L, 1987296L, 2158704L, 2350927L, 7931905L, 8628608L, 8983683L, 2947957L, 3176995L, 3263118L,
55402L, 54854L, 55050L, 52500L, 72000L, 68862L, 1158244L, 1099976L, 1019490L, 538146L, 471219L, 437954L, 863592L, 661055L,
548097L, 484450L, 442643L, 404487L, 1033728L, 925514L, 854793L, 371420L, 285257L, 260157L, 2039241L, 2150710L, 1898614L,
1175287L, 1495433L, 1569586L, 2646966L, 3330486L, 3282677L, 745784L, 858574L, 1119671L)),
class = "data.frame", row.names = c(NA, -90L))
ggplot(dat, aes(x = x, y = y)) + geom_point()
The relationship seems like a non-linear relationship. Hence I will fitted a model where I logged y and x
mod.lm <- lm(log(y) ~ log(x), data = dat)
ggplot(dat, aes(x = log(x), y = log(y))) + geom_point() + geom_smooth(method = "lm")
However, I can see that for lower values, the log-transformation results in big differences as shown by the residuals. I then moved to non linear least square method. I have not used this before but using this post
Why is nls() giving me "singular gradient matrix at initial parameter estimates" errors?
c.0 <- min(dat$y) * 0.5
model.0 <- lm(log(y - c.0) ~ x, data = dat)
start <- list(a = exp(coef(model.0)[1]), b = coef(model.0)[2], c = c.0)
model <- nls(y ~ a * exp(b * x) + c, data = dat, start = start)
Error in nls(y ~ a * exp(b * x) + c, data = dat, start = start) :
step factor 0.000488281 reduced below 'minFactor' of 0.000976562
Can anyone advise me what does this error mean and how to fit a nls model to the above data?
In your case nls get in problems as your starting values are not good and you introduced the coefficient c which is not there in the linearized form.
To fit your nls you can do it the following way, with better staring values and removing the coefficient c:
mod.glm <- glm(y ~ x, dat=dat, family=poisson(link = "log"))
start <- list(a = coef(mod.glm)[1], b = coef(mod.glm)[2])
mod.nls <- nls(y ~ exp(a + b * x), data = dat, start = start)
I would recommend to use glm, as shown above, instead of nls to find the coefficients.
If the estimate of the linearized model (mod.lm) should not have a bias you need to adjust it.
mod.lm <- lm(log(y) ~ log(x), data = dat)
mean(dat$y) #50.44444
mean(predict(mod.glm, type="response")) #50.44444
mean(predict(mod.nls)) #50.44499
mean(exp(predict(mod.lm))) #49.11622 !
f <- log(mean(dat$y) / mean(exp(predict(mod.lm)))) #bias corection for a
mean(exp(coef(mod.lm)[1] + f + coef(mod.lm)[2]*log(dat$x))) #50.44444
In case you want to get the coefficients given from James Phillips in the comments by your own, you can try:
mod.nlsJP <- nls(y ~ a * (x^(b*x)) + offset, data=dat, start=list(a=-30, b=-5e-6, offset=50))

R - barplot grouping columns

I have the following data which contains data from 7 combinations (rows) and 12 methods (columns).
structure(list(Beams = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 2L
), .Label = c("1 – 2", "1 – 2 – 3 – 4", "1 – 3", "1 – 4", "2 – 3",
"2 – 4", "3 – 4"), class = "factor"), Slope...No.weight = c(75L,
65L, 45L, 30L, 95L, 70L, 75L), Slope...W1 = c(85L, 70L, 65L,
55L, 90L, 85L, 75L), Slope...W2 = c(80L, 65L, 65L, 50L, 90L,
90L, 75L), Slope...W3 = c(80L, 75L, 75L, 65L, 90L, 95L, 80L),
Average.Time...No.Weight = c(75L, 65L, 45L, 30L, 95L, 70L,
70L), Average.Time...W1 = c(70L, 60L, 75L, 60L, 75L, 75L,
80L), Average.Time...W2 = c(65L, 40L, 65L, 50L, 75L, 85L,
70L), Average.Time...W3 = c(65L, 40L, 80L, 75L, 65L, 85L,
80L), Momentum...No.weight = c(80L, 60L, 45L, 30L, 95L, 70L,
75L), Momentum...W1 = c(85L, 75L, 60L, 55L, 95L, 90L, 80L
), Momentum...W2 = c(80L, 65L, 70L, 50L, 90L, 90L, 85L),
Momentum...W3 = c(85L, 75L, 75L, 55L, 90L, 95L, 80L)), .Names = c("Beams",
"Slope...No.weight", "Slope...W1", "Slope...W2", "Slope...W3",
"Average.Time...No.Weight", "Average.Time...W1", "Average.Time...W2",
"Average.Time...W3", "Momentum...No.weight", "Momentum...W1",
"Momentum...W2", "Momentum...W3"), class = "data.frame", row.names = c(NA,
-7L))
I would like to get a barplot like the one below:
I've tried with
library(RColorBrewer)
dat<-read.csv("phaser-p13-30dBm-100ms.csv")
names <- c("1-2","1-3","1-4","2-3","2-4","3-4","1-2-3-4")
barx <-
barplot(as.integer(dat2[,2:13]),
beside=TRUE,
col=brewer.pal(12,"Set3"),
names.arg=names,
ylim=c(0,100),
xlab='Combination of beams',
ylab='Correct detection [%]')
box()
par(xpd=TRUE)
legend("top", c("Slope - No weight","Slope - W1","Slope - W2","Slope - W3","Average Time - No weight","Average Time - W1","Average Time - W2","Average Time - W3","Momentum - No weight","Momentum - W1","Momentum - W2","Momentum - W3"), fill = brewer.pal(12,"Set3"),horiz = T)
but I got this error:
Error in barplot.default(as.integer(dat2[, 2:13]), beside = TRUE, col = brewer.pal(12, :
incorrect number of names
Could you find the error?
I've named you dataframe df here and made use of three packages. This is not a base R solution. Given your dataset format, this is the easiest way (IMO) to do this:
library(dplyr)
library(tidyr)
library(ggplot2)
df %>% # dataframe
gather(variable, value, -Beams) %>% # convert to long format excluding beams column
ggplot(aes(x=Beams, y=value, fill=variable)) + # plot the bar plot
geom_bar(stat='identity', position='dodge')
This should get you started, if you wish to use base graphics and not ggplot2:
df <- as.matrix(dat[,-1])
rownames(df) <- dat[, 1]
barplot(df, beside = TRUE, las = 2)
Use ggplot2 package and make sure that your data is neat and ordered?
something like ggplot(dataframe, aes(colour = some_factor))) + geom_bar(aes(x=Some_variable, y=Some_other_variable))
More explict statement as to how your data matches the image would be useful.

Resources