loop lm predictions for plotting multiple lines - r

I want to plot linear-model-lines for each ID.
How can I create predictions for multiple lms (or glms) using sequences of different length? I tried:
#some fake data
res<-runif(60,1,20)
var<-runif(60,10,50)
ID<-rep(c("A","B","B","C","C","C"),10)
data<- data.frame(ID,res,var)
#lm
library(data.table)
dt <- data.table(data,key="ID")
fits <- lapply(unique(data$ID),
function(z)lm(res~var, data=dt[J(z),], y=T))
#sequence for each ID of length var(ID)
mins<-matrix(with(data, tapply(var,ID,min)))
mins1<-mins[,1]
maxs<-matrix(with(data,tapply(var,ID,max)))
maxs1<-maxs[,1]
my_var<-list()
for(i in 1:3){
my_var[[i]]<- seq(from=mins1[[i]],to=maxs1[[i]],by=1)
}
# predict on sequences
predslist<- list()
predslist[[i]] <- for(i in 1:3){
dat<-fits[[i]]
predict(dat,newdata= data.frame("var"= my_var,type= "response", se=TRUE))
}
predict results error

Plotting lm lines only for var[i] ranges works in ggplot:
library(ggplot2)
# create ID, x, y as coded by Matt
p <- qplot(x, y)
p + geom_smooth(aes(group=ID), method="lm", size=1, se=F)

Is something like this what you're after?
# generating some fake data
ID <- rep(letters[1:4],each=10)
x <- rnorm(40,mean=5,sd=10)
y <- as.numeric(as.factor(ID))*x + rnorm(40)
# plotting in base R
plot(x, y, col=as.factor(ID), pch=16)
# calling lm() and adding lines
lmlist <- lapply(sort(unique(ID)), function(i) lm(y[ID==i]~x[ID==i]))
for(i in 1:length(lmlist)) abline(lmlist[[i]], col=i)
Don't know if the plotting part is where you're stuck, but the abline() function will draw a least-squares line if you pass in an object returned from lm().
If you want the least-squares lines to begin & end with the min & max x values, here's a workaround. It's not pretty, but seems to work.
plot(x, y, col=as.factor(ID), pch=16)
IDnum <- as.numeric(as.factor(ID))
for(i in 1:length(lmlist)) lines(x[IDnum==i], predict(lmlist[[i]]), col=i)

Related

multiple ggplots in for loop, always same plot

So I have a dataframe tmp, and each column follows different distributions. What I want to do is plot the histograms in a pdf, each page a histogram. But why do I get three times the same histogram?
When I type in g11 I get the histogram g13, but when I plot the histograms in the pdf, instead of 3 identical pages (with histograms 1-3), I get 3 different pages with the same histogram on it.
Might it be that ggplot works with a pointer, and that due to the second for-loop, it plots g1i?
Is there a way to rewrite my code? (The example is simplified from my problem)
tmp <- data.frame(x=rnorm(n=20, mean=0, sd=1),
y=rnorm(n=20, mean=10, sd=2),
z=rnorm(n=20, mean=40, sd=5))
for (i in 1:3){
assign(paste("g1", i, sep=""),ggplot(tmp,aes(x=get(colnames(tmp)[i]))) + geom_histogram(binwidth=1))
}
pdf("/pathto/plot.pdf")
for(i in 1:3){
#i <- 1
grid.arrange(get(paste("g1", 1, sep="")), get(paste("g1", 2, sep="")), get(paste("g1", 3, sep="")))
}
dev.off()
I think the get in the first for loop is not changing the column correctly.
You could try this instead:
for (i in 1:3){
assign(paste("g1", i, sep=""),ggplot(tmp,aes_string(x=colnames(tmp)[i])) + geom_histogram(binwidth=1))
}
I would rewrite it like in the code below.
First I would put the data from wide to long format and then subset each level (x, y, z) and plot it.
library(ggplot2)
library(tidyr)
tmp <- data.frame(x=rnorm(n=20, mean=0, sd=1),
y=rnorm(n=20, mean=10, sd=2),
z=rnorm(n=20, mean=40, sd=5))
xy <- gather(tmp)
pdf("histogram.pdf")
for (i in unique(xy$key)) {
x <- droplevels(xy[xy$key == i, ])
print(
ggplot(x, aes(x = value)) +
theme_bw() +
geom_histogram()
)
}
dev.off()

Graphing a large number of plots

I'm fitting a dose-response curve to many data sets that I want to plot to a single file.
Here's how one data set looks like:
df <- data.frame(dose=c(10,0.625,2.5,0.156,0.0391,0.00244,0.00977,0.00061,10,0.625,2.5,0.156,0.0391,0.00244,0.00977,0.00061,10,0.625,2.5,0.156,0.0391,0.00244,0.00977,0.00061),viability=c(6.12,105,57.9,81.9,86.5,98.3,96.4,81.8,27.3,85.2,80.8,92,82.5,110,90.2,76.6,11.9,89,35.4,79,95.8,117,82.1,95.1),stringsAsFactors=F)
Here's the dose-response fit:
library(drc)
fit <- drm(viability~dose,data=df,fct=LL.4(names=c("Slope","Lower Limit","Upper Limit","ED50")))
Now I'm predicting values in order to plot the curve:
pred.df <- expand.grid(dose=exp(seq(log(max(df$dose)),log(min(df$dose)),length=100)))
pred <- predict(fit,newdata=pred.df,interval="confidence")
pred.df$viability <- pred[,1]
pred.df$viability.low <- pred[,2]
pred.df$viability.high <- pred[,3]
And this is how a single plot looks like:
library(ggplot2)
p <- ggplot(df,aes(x=dose,y=viability))+geom_point()+geom_ribbon(data=pred.df,aes(x=dose,y=viability,ymin=viability.low,ymax=viability.high),alpha=0.2)+labs(y="viability")+
geom_line(data=pred.df,aes(x=dose,y=viability))+coord_trans(x="log")+theme_bw()+scale_x_continuous(name="dose",breaks=sort(unique(df$dose)),labels=format(signif(sort(unique(df$dose)),3),scientific=T))+ggtitle(label="all doses")
adding a few parameter estimates to the plot:
params <- signif(summary(fit)$coefficient[-1,1],3)
names(params) <- c("lower","upper","ed50")
p <- p + annotate("text",size=3,hjust=0,x=2.4e-3,y=5,label=paste(sapply(1:length(params),function(p) paste0(names(params)[p],"=",params[p])),collapse="\n"),colour="black")
Which gives:
Now suppose I have 20 of these that I want to cram in a single figure file.
I thought that a reasonable solution would be to use grid.arrange:
As an example I'll loop 20 times on this example data set:
plot.list <- vector(mode="list",20)
for(i in 1:20){
plot.list[[i]] <- ggplot(df,aes(x=dose,y=viability))+geom_point()+geom_ribbon(data=pred.df,aes(x=dose,y=viability,ymin=viability.low,ymax=viability.high),alpha=0.2)+labs(y="viability")+
geom_line(data=pred.df,aes(x=dose,y=viability))+coord_trans(x="log")+theme_bw()+scale_x_continuous(name="dose",breaks=sort(unique(df$dose)),labels=format(signif(sort(unique(df$dose)),3),scientific=T))+ggtitle(label="all doses")+
annotate("text",size=3,hjust=0,x=2.4e-3,y=5,label=paste(sapply(1:length(params),function(p) paste0(names(params)[p],"=",params[p])),collapse="\n"),colour="black")
}
And then plot using:
library(grid)
library(gridExtra)
grid.arrange(grobs=plot.list,ncol=3,nrow=ceiling(length(plot.list)/3))
Which is obviously poorly scaled. So my question is how to create this figure with better scaling - meaning that all objects are compressed proportionally in way that produces a figure that is still visually interperable.
You should set the device size so that the plots remain readable, e.g.
pl = replicate(11, qplot(1,1), simplify = FALSE)
g = arrangeGrob(grobs = pl, ncol=3)
ggsave("plots.pdf", g, width=15, height=20)

How to create Lines connecting two points in R

Is there any way to create lines in R connecting two points?
I am aware of lines(), function, but it creates line segment what I am looking for is an infinite length line.
Here's an example of Martha's suggestion:
set.seed(1)
x <- runif(2)
y <- runif(2)
# function
segmentInf <- function(xs, ys){
fit <- lm(ys~xs)
abline(fit)
}
plot(x,y)
segmentInf(x,y)
#define x and y values for the two points
x <- rnorm(2)
y <- rnorm(2)
slope <- diff(y)/diff(x)
intercept <- y[1]-slope*x[1]
plot(x, y)
abline(intercept, slope, col="red")
# repeat the above as many times as you like to satisfy yourself
Use segment() function.
#example
x1 <- stats::runif(5)
x2 <- stats::runif(5)+2
y <- stats::rnorm(10)
plot(c(x1,x2), y)
segments(x1, y[1:5], x2, y[6:10], col= 'blue')

How do I plot the first derivative of the smoothing function?

I have the following script that emulates the type of data structure I have and analysis that I want to do on it,
library(ggplot2)
library(reshape2)
n <- 10
df <- data.frame(t=seq(n)*0.1, a =sort(rnorm(n)), b =sort(rnorm(n)),
a.1=sort(rnorm(n)), b.1=sort(rnorm(n)),
a.2=sort(rnorm(n)), b.2=sort(rnorm(n)))
head(df)
mdf <- melt(df, id=c('t'))
## head(mdf)
levels(mdf$variable) <- rep(c('a','b'),3)
g <- ggplot(mdf,aes(t,value,group=variable,colour=variable))
g +
stat_smooth(method='lm', formula = y ~ ns(x,3)) +
geom_point() +
facet_wrap(~variable) +
opts()
What I would like to do in addition to this is plot the first derivative of the smoothing function against t and against the factors, c('a','b'), as well. Any suggestions how to go about this would be greatly appreciated.
You'll have to construct the derivative yourself, and there are two possible ways for that. Let me illustrate by using only one group :
require(splines) #thx #Chase for the notice
lmdf <- mdf[mdf$variable=="b",]
model <- lm(value~ns(t,3),data=lmdf)
You then simply define your derivative as diff(Y)/diff(X) based on your predicted values, as you would do for differentiation of a discrete function. It's a very good approximation if you take enough X points.
X <- data.frame(t=seq(0.1,1.0,length=100) ) # make an ordered sequence
Y <- predict(model,newdata=X) # calculate predictions for that sequence
plot(X$t,Y,type="l",main="Original fit") #check
dY <- diff(Y)/diff(X$t) # the derivative of your function
dX <- rowMeans(embed(X$t,2)) # centers the X values for plotting
plot(dX,dY,type="l",main="Derivative") #check
As you can see, this way you obtain the points for plotting the derivative. You'll figure out from here how to apply this to both levels and combine those points to the plot you like. Below the plots from this sample code :
Here's one approach to plotting this with ggplot. There may be a more efficient way to do it, but this uses the manual calculations done by #Joris. We'll simply construct a long data.frame with all of the X and Y values while also supplying a variable to "facet" the plots:
require(ggplot2)
originalData <- data.frame(X = X$t, Y, type = "Original")
derivativeData <- data.frame(X = dX, Y = dY, type = "Derivative")
plotData <- rbind(originalData, derivativeData)
ggplot(plotData, aes(X,Y)) +
geom_line() +
facet_wrap(~type, scales = "free_y")
If data is smoothed using smooth.spline, the derivative of predicted data can be specified using the argument deriv in predict. Following from #Joris's solution
lmdf <- mdf[mdf$variable == "b",]
model <- smooth.spline(x = lmdf$t, y = lmdf$value)
Y <- predict(model, x = seq(0.1,1.0,length=100), deriv = 1) # first derivative
plot(Y$x[, 1], Y$y[, 1], type = 'l')
Any dissimilarity in the output is most likely due to differences in the smoothing.

multiple histograms on top of eachother without bins

Let's say I've got this dataframe with 2 levels. LC and HC.
Now i want to get 2 plots like below on top of eachother.
data <- data.frame(
welltype=c("LC","LC","LC","LC","LC","HC","HC","HC","HC","HC"),
value=c(1,2,1,2,1,5,4,5,4,5))
The code to get following plot =
x <- rnorm(1000)
y <- hist(x)
plot(y$breaks,
c(y$counts,0),
type="s",col="blue")
(with thanks to Joris Meys)
So, how do I even start on this. Since I'm used to java I was thinking of a for loop, but I've been told not to do it this way.
Next to the method provided by Aaron, there's a ggplot solution as well (see below),
but I would strongly advise you to use the densities, as they will give nicer plots and are a whole lot easier to construct :
# make data
wells <- c("LC","HC","BC")
Data <- data.frame(
welltype=rep(wells,each=100),
value=c(rnorm(100),rnorm(100,2),rnorm(100,3))
)
ggplot(Data,aes(value,fill=welltype)) + geom_density(alpha=0.2)
gives :
For the plot you requested :
# make hists dataframe
hists <- tapply(Data$value,Data$welltype,
function(i){
tmp <- hist(i)
data.frame(br=tmp$breaks,co=c(tmp$counts,0))
})
ll <- sapply(hists,nrow)
hists <- do.call(rbind,hists)
hists$fac <- rep(wells,ll)
# make plot
require(ggplot2)
qplot(br,co,data=hists,geom="step",colour=fac)
You can use the same code except with points instead of plot for adding additional lines to the plot.
Making up some data
set.seed(5)
d <- data.frame(x=c(rnorm(1000)+3, rnorm(1000)),
g=rep(1:2, each=1000) )
And doing it in a fairly straightforward way:
x1 <- d$x[d$g==1]
x2 <- d$x[d$g==2]
y1 <- hist(x1, plot=FALSE)
y2 <- hist(x2, plot=FALSE)
plot(y1$breaks, c(y1$counts,0), type="s",col="blue",
xlim=range(c(y1$breaks, y2$breaks)), ylim=range(c(0,y1$counts, y2$counts)))
points(y2$breaks, c(y2$counts,0), type="s", col="red")
Or in a more R-ish way:
col <- c("blue", "red")
ds <- split(d$x, d$g)
hs <- lapply(ds, hist, plot=FALSE)
plot(0,0,type="n",
ylim=range(c(0,unlist(lapply(hs, function(x) x$counts)))),
xlim=range(unlist(lapply(hs, function(x) x$breaks))) )
for(i in seq_along(hs)) {
points(hs[[i]]$breaks, c(hs[[i]]$counts,0), type="s", col=col[i])
}
EDIT: Inspired by Joris's answer, I'll note that lattice can also easily do overlapping density plots.
library(lattice)
densityplot(~x, group=g, data=d)

Resources