using R to plot interaction plot - r

I have created a model using following
age hrs charges
530.6071 792.10 3474.60
408.6071 489.70 1247.06
108.0357 463.00 1697.07
106.6071 404.15 1676.33
669.4643 384.65 1701.13
556.4643 358.15 1630.30
665.4643 343.85 2468.83
508.4643 342.35 3366.44
106.0357 335.25 2876.82
interaction_model <- rlm( charges~age+hrs+age*hrs, age_vs_hrs_charges_cleaned);
Any idea how i can plot this in 3D?
I already plotted using
library(effects);
plot(effect(term="age:hrs", mod=interaction_model,default.levels=20),multiline=TRUE);
but this is not very clear visualization.
Any help?

There are several ways to do this.
model <- lm( charges~age+hrs+age*hrs, df)
# set up grid of (x,y) values
age <- seq(0,1000, by=20)
hrs <- seq(0,1000, by=20)
gg <- expand.grid(age=age, hrs=hrs)
# prediction from the linear model
gg$charges <-predict(model,newdata=gg)
# contour plot
library(ggplot2)
library(colorRamps)
library(grDevices)
jet.colors <- colorRampPalette(matlab.like(9))
ggplot(gg, aes(x=age, y=hrs, z=charges))+
stat_contour(aes(color=..level..),binwidth=200, size=2)+
scale_color_gradientn(colours=jet.colors(8))
# 3D scatterplot
library(scatterplot3d)
scatterplot3d(gg$age, gg$hrs, gg$charges)
# interactive 3D scatterplot (just a screen shot here)
library(rgl)
plot3d(gg$age,gg$hrs,gg$charges)
# interactive 3D surface plot with shading (screen shot)
colorjet <- jet.colors(100)
open3d()
rgl.surface(x=age, z=hrs, y=0.05*gg$charges,
color=colorzjet[ findInterval(gg$charges, seq(min(gg$charges), max(gg$charges), length=100))] )
axes3d()

A little while ago I wrote a couple of functions to display the results of a (general) linear model, together with colour coded data points, in either 3D (interactive, using rgl) or 2D (using a contour plot) :
# plot predictions of a (general) linear model as a function of two explanatory variables as an image / contour plot
# together with the actual data points
# mean value is used for any other variables in the model
plotImage=function(model=NULL,plotx=NULL,ploty=NULL,plotPoints=T,plotContours=T,plotLegend=F,npp=1000,xlab=NULL,ylab=NULL,zlab=NULL,xlim=NULL,ylim=NULL,pch=16,cex=1.2,lwd=0.1,col.palette=NULL) {
n=npp
require(rockchalk)
require(aqfig)
require(colorRamps)
require(colorspace)
require(MASS)
mf=model.frame(model);emf=rockchalk::model.data(model)
if (is.null(xlab)) xlab=plotx
if (is.null(ylab)) ylab=ploty
if (is.null(zlab)) zlab=names(mf)[[1]]
if (is.null(col.palette)) col.palette=rev(rainbow_hcl(1000,c=100))
x=emf[,plotx];y=emf[,ploty];z=mf[,1]
if (is.null(xlim)) xlim=c(min(x)*0.95,max(x)*1.05)
if (is.null(ylim)) ylim=c(min(y)*0.95,max(y)*1.05)
preds=predictOMatic(model,predVals=c(plotx,ploty),n=npp,divider="seq")
zpred=matrix(preds[,"fit"],npp,npp)
zlim=c(min(c(preds$fit,z)),max(c(preds$fit,z)))
par(mai=c(1.2,1.2,0.5,1.2),fin=c(6.5,6))
graphics::image(x=seq(xlim[1],xlim[2],len=npp),y=seq(ylim[1],ylim[2],len=npp),z=zpred,xlab=xlab,ylab=ylab,col=col.palette,useRaster=T,xaxs="i",yaxs="i")
if (plotContours) graphics::contour(x=seq(xlim[1],xlim[2],len=npp),y=seq(ylim[1],ylim[2],len=npp),z=zpred,xlab=xlab,ylab=ylab,add=T,method="edge")
if (plotPoints) {cols1=col.palette[(z-zlim[1])*999/diff(zlim)+1]
pch1=rep(pch,length(n))
cols2=adjustcolor(cols1,offset=c(-0.3,-0.3,-0.3,1))
pch2=pch-15
points(c(rbind(x,x)),c(rbind(y,y)), cex=cex,col=c(rbind(cols1,cols2)),pch=c(rbind(pch1,pch2)),lwd=lwd) }
box()
if (plotLegend) vertical.image.legend(zlim=zlim,col=col.palette) # TO DO: add z axis label, maybe make legend a bit smaller?
}
# plot predictions of a (general) linear model as a function of two explanatory variables as an interactive 3D plot
# mean value is used for any other variables in the model
plotPlaneFancy=function(model=NULL,plotx1=NULL,plotx2=NULL,plotPoints=T,plotDroplines=T,npp=50,x1lab=NULL,x2lab=NULL,ylab=NULL,x1lim=NULL,x2lim=NULL,cex=1.5,col.palette=NULL,segcol="black",segalpha=0.5,interval="none",confcol="lightgrey",confalpha=0.4,pointsalpha=1,lit=T,outfile="graph.png",aspect=c(1,1,0.3),zoom=1,userMatrix=matrix(c(0.80,-0.60,0.022,0,0.23,0.34,0.91,0,-0.55,-0.72,0.41,0,0,0,0,1),ncol=4,byrow=T),windowRect=c(0,29,1920,1032)) { # or library(colorRamps);col.palette <- matlab.like(1000)
require(rockchalk)
require(rgl)
require(colorRamps)
require(colorspace)
require(MASS)
mf=model.frame(model);emf=rockchalk::model.data(model)
if (is.null(x1lab)) x1lab=plotx1
if (is.null(x2lab)) x2lab=plotx2
if (is.null(ylab)) ylab=names(mf)[[1]]
if (is.null(col.palette)) col.palette=rev(rainbow_hcl(1000,c=100))
x1=emf[,plotx1]
x2=emf[,plotx2]
y=mf[,1]
if (is.null(x1lim)) x1lim=c(min(x1),max(x1))
if (is.null(x2lim)) x2lim=c(min(x2),max(x2))
preds=predictOMatic(model,predVals=c(plotx1,plotx2),n=npp,divider="seq",interval=interval)
ylim=c(min(c(preds$fit,y)),max(c(preds$fit,y)))
open3d(zoom=zoom,userMatrix=userMatrix,windowRect=windowRect)
if (plotPoints) plot3d(x=x1,y=x2,z=y,type="s",col=col.palette[(y-min(y))*999/diff(range(y))+1],size=cex,aspect=aspect,xlab=x1lab,ylab=x2lab,zlab=ylab,lit=lit,alpha=pointsalpha)
if (!plotPoints) plot3d(x=x1,y=x2,z=y,type="n",col=col.palette[(y-min(y))*999/diff(range(y))+1],size=cex,aspect=aspect,xlab=x1lab,ylab=x2lab,zlab=ylab)
if ("lwr" %in% names(preds)) persp3d(x=unique(preds[,plotx1]),y=unique(preds[,plotx2]),z=matrix(preds[,"lwr"],npp,npp),color=confcol, alpha=confalpha, lit=lit, back="lines",add=TRUE)
ypred=matrix(preds[,"fit"],npp,npp)
cols=col.palette[(ypred-min(ypred))*999/diff(range(ypred))+1]
persp3d(x=unique(preds[,plotx1]),y=unique(preds[,plotx2]),z=ypred,color=cols, alpha=0.7, lit=lit, back="lines",add=TRUE)
if ("upr" %in% names(preds)) persp3d(x=unique(preds[,plotx1]),y=unique(preds[,plotx2]),z=matrix(preds[,"upr"],npp,npp),color=confcol, alpha=confalpha, lit=lit, back="lines",add=TRUE)
if (plotDroplines) segments3d(x=rep(x1,each=2),y=rep(x2,each=2),z=matrix(t(cbind(y,fitted(model))),nc=1),col=segcol,lty=2,alpha=segalpha)
if (!is.null(outfile)) rgl.snapshot(outfile, fmt="png", top=TRUE)
}
Here is what you get as output with your model :
data=data.frame(age=c(530.6071,408.6071,108.0357,106.6071,669.4643,556.4643,665.4643,508.4643,106.0357),
hrs=c(792.10,489.70,463.00,404.15,384.65,358.15,343.85,342.35,335.25),
charges=c(3474.60,1247.06,1697.07,1676.33,1701.13,1630.30,2468.83,3366.44,2876.82))
library(MASS)
fit1=rlm( charges~age+hrs+age*hrs, data)
plotPlaneFancy(fit1, plotx1 = "age", plotx2 = "hrs")
plotPlaneFancy(fit1, plotx1 = "age", plotx2 = "hrs",interval="confidence")
(or interval="prediction" to show 95% prediction intervals)
plotImage(fit1,plotx="age",ploty="hrs",plotContours=T,plotLegend=T)

Related

How to generate a Scree Plot for Hierarchical Cluster in R?

I have generated a dendrogram using plot() function and used hclust() for hierarchical clustering. I am looking to generate a scree plot for the same. Any suggestions?
It's a little late, but I have an answer.
# creating a dissimilarity matrix
res.dist <- dist(USArrests, method = "euclidean")
# creating an object of class "hclust"
res.hc <- hclust(d = res.dist, method = "ward.D2")
As can be found in the documentation to hclust, it is a list of values. You can inspect them by using
View(res.hc)
Now, the variable height has exactly what is needed for a scree plot. The following code generates a scree plot:
> ggplot(res.hc$height %>%
+ as.tibble() %>%
+ add_column(groups = length(res.hc$height):1) %>%
+ rename(height=value),
+ aes(x=groups, y=height)) +
+ geom_point() +
+ geom_line()
Basically, what you do is plot the height for a number of groups.
(It might not be very elegant, I'd be delighted to hear shorter versions to generate the same outcome).
My outcome is:
library(nFactors)
ev <- eigen(cor(mydata)) # get eigenvalues
ap <- parallel(subject=nrow(mydata),var=ncol(mydata),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
Please do take a look at the youtube link here, which will be of use
https://www.youtube.com/watch?v=aMYCFtoBrdA
Regards
code
Link to R code on google drive for download
https://drive.google.com/file/d/0Byo-GmbU7XciVGRQcTk3QzdTMjA/view?usp=sharing
R code
#-----------------------------------------------
# Hierarchical clustering with the sample data
#------------------------------------------------
# Reading data into R similar to CARDS
temp_str <- "Name physics math
P 15 20
Q 20 15
R 26 21
X 44 52
Y 50 45
Z 57 38
A 80 85
B 90 88
C 98 98"
base_data <- read.table(textConnection(
temp_str), header = TRUE)
closeAllConnections()
# Check distinct categories of Variables useing STR function
str(base_data)
# Plot data
plot(base_data$physics, base_data$math,
pch=21, bg=c("red","green3","blue","red","green3","blue",
"red","green3","blue")[unclass(base_data$Name)],
main="Base Data")
# Step 01- obtain distance matrix (right way)
my_dist <- dist(base_data[c(2,3)], method = "euclidean")
print(my_dist)
# Step 02- Apply Hierarchical Clustering
fit <- hclust(my_dist, method="ward.D2")
# Step 03- Display dendogram
plot(fit, labels = base_data$Name)
Dendogram_Height=0
for (i in 2:9) Dendogram_Height[i] <- fit$height[i-1]
plot(1:9, Dendogram_Height, type="b", xlab="Sequence of merging",
ylab="Dendogram Height")
plot(9:1, Dendogram_Height, type="b", xlab="# of clusters",
ylab="Dendogram Height")
# Step 04- draw dendogram with color borders
# One can use this step to take a look at execution
rect.hclust(fit, k=8, border="red")
plot(fit, labels = base_data$Name)
rect.hclust(fit, k=7, border="red")
plot(fit, labels = base_data$Name)
rect.hclust(fit, k=6, border="red")
# draw color borders around required clusterd
plot(fit, labels = base_data$Name)
rect.hclust(fit, k=3, border="blue")
# cut tree into 3 clusters
my_groups <- cutree(fit, k=3)

R plotting multiple survival curves in the same plot

I am trying to plot multiple survival curves in the same plot. Using plot I can easily do this by
plot(sr_fit_0, col = 'red' , conf.int=TRUE, xlim=c(0, max_m))
par(new=TRUE)
plot(sr_fit_1, col ='blue', conf.int=TRUE, xlim=c(0, max_m))`
But now I want to use ggsurv to plot survival curve and I don't know how to have both of them in the same plot(not subplots). Any help is appreciated.
I generated some data for life below for life of hamsters and gerbils. You can use the survfit() function similar to other curve fitting functions and define a data frame column that splits the population. When you create the plot with ggsurv() I think it will display what you are looking for.
## Make some data for varmint life
set.seed(1); l1 <- rnorm(120, 2.5, 1)
gerbils <- data.frame(life = l1[l1>0])
set.seed(3); l2 <- rnorm(120, 3, 1)
hamsters <- data.frame(life = l2[l2>0])
## Load required packages
require('survival'); require('GGally')
## Generate fits for survival curves
## (Note that Surv(x) creates a Survival Object)
sf.gerbils <- survfit(Surv(life) ~ 1, data = gerbils)
sf.hamsters <- survfit(Surv(life) ~ 1, data = hamsters)
ggsurv(sf.gerbils) #Survival plot for gerbils
ggsurv(sf.hamsters) #Survival plot for hamsters
## Combine gerbils and hamsters while adding column for identification
varmints <- rbind((cbind(gerbils, type = 'gerbil')),
(cbind(hamsters, type = 'hamster')))
## Generate survival for fit for all varmints as a function of type
sf.varmints <- survfit(Surv(life) ~ type, data = varmints)
## Plot the survival curves on one chart
ggsurv(sf.varmints)

R superimposing bivariate normal density (ellipses) on scatter plot

There are similar questions on the website, but I could not find an answer to this seemingly very simple problem. I fit a mixture of two gaussians on the Old Faithful Dataset:
if(!require("mixtools")) { install.packages("mixtools"); require("mixtools") }
data_f <- faithful
plot(data_f$waiting, data_f$eruptions)
data_f.k2 = mvnormalmixEM(as.matrix(data_f), k=2, maxit=100, epsilon=0.01)
data_f.k2$mu # estimated mean coordinates for the 2 multivariate Gaussians
data_f.k2$sigma # estimated covariance matrix
I simply want to super-impose two ellipses for the two Gaussian components of the model described by the mean vectors data_f.k2$mu and the covariance matrices data_f.k2$sigma. To get something like:
For those interested, here is the MatLab solution that created the plot above.
If you are interested in the colors as well, you can use the posterior to get the appropriate groups. I did it with ggplot2, but first I show the colored solution using #Julian's code.
# group data for coloring
data_f$group <- factor(apply(data_f.k2$posterior, 1, which.max))
# plotting
plot(data_f$eruptions, data_f$waiting, col = data_f$group)
for (i in 1: length(data_f.k2$mu)) ellipse(data_f.k2$mu[[i]],data_f.k2$sigma[[i]], col=i)
And for my version using ggplot2.
# needs ggplot2 package
require("ggplot2")
# ellipsis data
ell <- cbind(data.frame(group=factor(rep(1:length(data_f.k2$mu), each=250))),
do.call(rbind, mapply(ellipse, data_f.k2$mu, data_f.k2$sigma,
npoints=250, SIMPLIFY=FALSE)))
# plotting command
p <- ggplot(data_f, aes(color=group)) +
geom_point(aes(waiting, eruptions)) +
geom_path(data=ell, aes(x=`2`, y=`1`)) +
theme_bw(base_size=16)
print(p)
You can use the ellipse-function from package mixtools. The initial problem was that this function swaps x and y from your plot. I'll try to figure this out and update the answe. (I'll leave the colors to somebody else...)
plot( data_f$eruptions,data_f$waiting)
for (i in 1: length(data_f.k2$mu)) ellipse(data_f.k2$mu[[i]],data_f.k2$sigma[[i]])
Using mixtools internal plotting function:
plot.mixEM(data_f.k2, whichplots=2)

creating multi-panel ROC curve plots with lattice

I would like to create a plot which contains two panels, and each panel contains two ROC curve. To introduce my (failing) approach, I generate data frame containing the true label, labels for four methods (each method corresponds to)
N <- 20
TF <- rep(c(0,1),each=N/2)
pred <- method <- true <- NULL
for (imethod in 1 : 4){
pred <- c(pred,seq(-1,1,length.out=N) + rnorm(N) )
method <- c(method,rep(imethod,N))
true <- c(true,TF)
}
dat.roc <-
data.frame(true=true,pred=pred,method=method,panel=rep(1:2,each=length(method)/2))
xyplot(true ~ pred|panel, data=dat.roc,groups=method,
xlim=c(0,1),xlab="1-specificity",
ylab="sensitivity",
panel=function(x,y,...){
DD <- table(-x,y)
sens <- cumsum(DD[,2])/sum(DD[,2])
mspec <- cumsum(DD[,1])/sum(DD[,1])
panel.xyplot(mspec,sens,type="l",...)
panel.abline(0,1)
})
The plot have two panels, each of which has only ONE ROC curve (with two colors)! How can I correctly specify lattice to return two ROC curve in each panel?
Since you're using cumsum here in your panel function, you want to make sure that you are creating different plots for each group, not just each panel. One way to do this is to use the panel.superpose panel function So you would change your code to
xyplot(true ~ pred|panel, data=dat.roc,groups=method,
xlim=c(0,1),xlab="1-specificity",
ylab="sensitivity",
panel=panel.superpose,
panel.groups=function(x,y,type, ...){
DD <- table(-x,y)
sens <- cumsum(DD[,2])/sum(DD[,2])
mspec <- cumsum(DD[,1])/sum(DD[,1])
panel.xyplot(mspec,sens,type="l",...)
panel.abline(0,1)
})
which produces the plot

Plot power of a straight line not a curve

So I am using the following to script:
area <- c(1854,2001,2182,2520,4072,1627,1308,1092,854,1223,2231,1288,898,2328,1660,6018,5420,943,1625,1095,1484,929,1178,4072,2413)
weight1 <- c(24281,28474,33725,40707,76124,16263,12190,10153,8631,13690,34408,15375,8806,36245,20506,109489,104014,11308,23262,11778,20650,8771,12356,76124,28346)
weight <- weight1/1000
df <- data.frame(weight = log10(weight), area = log10(area))
fit_line <- predict(lm(area ~ weight, data=df))
fit_power <- predict(nls(area ~ i*weight^z, start=list(i=2,z=0.7), data=df))
plot(df$weight,df$area)
lines(df$weight,fit_line,col="red")
lines(sort(df$weight),sort(fit_power), col="blue")
To do a log - log plot. I can plot a straight with lm() but when I use nls() to do power fit, it plots a curve and not a straight line, see below:
How do I plot the power fit in the form of a straight line, or how can I derive it from lm(). SO that I have the answer in the form of: y = a*x^b
Your plot is not a log plot. To do a log plot:
plot(log(area)~log(weight), df)
Then to fit a line:
LM.Log <- lm(log(area)~log(weight), df)
abline(LM.Log, col="red")
And to do a curved line through a straight plot more efficiently:
Power <- coef(LM.Log)[2]
LM.Normal <- lm(area~I(weight^Power)+0, df)
plot(area~weight, df)
plot(function(x) coef(LM.Normal)*x^Power, 0, 2, add=T, col="blue")
Perhaps the following will be instructive...
df <- data.frame(weight, area, weightl = log10(weight), areal = log10(area))
df <- df[order(df$weight),]
fit_line <- predict(lm(areal ~ weightl, data=df))
fit_power <- predict(nls(area ~ i*weight^z, start=list(i=2,z=0.7), data=df))
plot(df$weightl, df$areal)
lines(df$weightl, fit_line, col="red")
lines(df$weightl, log10(fit_power), col="blue")
plot(df$weight, df$area)
lines(df$weight, 10^fit_line, col="red")
lines(df$weight, fit_power, col="blue")
I guessed, I hope correctly, that you really want a power curve through the raw values and you're taking log10 as a proxy for such. So, what you need to do is get predicted values of the raw weight / area relations and then log those and put everything on a log graph. Or get a the linear of the log values and put them both as curves on a raw graph. Examine both of the plots produced above.

Resources