R: Color in CreateRadialPlot source - r

I am using CreateRadialPlot.R source and trying to plot this:
var.names <- c("All Flats", "No central heating", "Rooms per\nhousehold", "People per room",
"HE Qualification", "Routine/Semi-Routine\nOccupation", "2+ Car household",
"Public Transport\nto work", "Work from home")
values.a <- c(-0.1145725, -0.1824095, -0.01153078, -0.0202474, 0.05138737, -0.1557234,
0.1099018, -0.05310315, 0.0182626)
values.b <- c(0.2808439, -0.2936949, -0.1925846, 0.08910815, -0.03468011, 0.07385727,
-0.07228813, 0.1501105, -0.06800127)
group.names <- c("Blue Collar Communities", "Prospering Suburbs")
m2 <- matrix(c(values.a, values.b), nrow = 2, ncol = 9, byrow = TRUE)
group.names <- c(group.names)
df2 <- data.frame(group = group.names, m2)
colnames(df2)[2:10] <- var.names
source("http://pcwww.liv.ac.uk/~william/Geodemographic%20Classifiability/func%20CreateRadialPlot.r")
CreateRadialPlot(df2, plot.extent.x = 1.5)
I want to change the color of the lines for "Blue Collar Communities" and for "Prospering Suburbs", but the function does not have an argument for that. Would I need to create a new function? And what if I would like to set a title for the plot?

You need to adapt the function to accept colors for geom_path. Here, I added the third line in this part of the function (and the "+" the line before):
base <- base + geom_path(data=group$path,aes(x=x,y=y,group=group,colour=group),
size=group.line.width) +
scale_color_brewer(palette = "Set1",levels(group)) #added
This is the result:
CreateRadialPlot(df2, plot.extent.x = 1.5,background.circle.colour="white")
Here's the modified function if you want it:
CreateRadialPlot <- function(plot.data,
axis.labels=colnames(plot.data)[-1],
grid.min=-0.5, #10,
grid.mid=0, #50,
grid.max=0.5, #100,
centre.y=grid.min - ((1/9)*(grid.max-grid.min)),
plot.extent.x.sf=1.2,
plot.extent.y.sf=1.2,
x.centre.range=0.02*(grid.max-centre.y),
label.centre.y=FALSE,
grid.line.width=0.5,
gridline.min.linetype="longdash",
gridline.mid.linetype="longdash",
gridline.max.linetype="longdash",
gridline.min.colour="grey",
gridline.mid.colour="blue",
gridline.max.colour="grey",
grid.label.size=4,
gridline.label.offset=-0.02*(grid.max-centre.y),
label.gridline.min=TRUE,
axis.label.offset=1.15,
axis.label.size=3,
axis.line.colour="grey",
group.line.width=1,
group.point.size=4,
background.circle.colour="yellow",
background.circle.transparency=0.2,
plot.legend=if (nrow(plot.data)>1) TRUE else FALSE,
legend.title="Cluster",
legend.text.size=grid.label.size ) {
var.names <- colnames(plot.data)[-1] #'Short version of variable names
#axis.labels [if supplied] is designed to hold 'long version' of variable names
#with line-breaks indicated using \n
#caclulate total plot extent as radius of outer circle x a user-specifiable scaling factor
plot.extent.x=(grid.max+abs(centre.y))*plot.extent.x.sf
plot.extent.y=(grid.max+abs(centre.y))*plot.extent.y.sf
#Check supplied data makes sense
if (length(axis.labels) != ncol(plot.data)-1)
return("Error: 'axis.labels' contains the wrong number of axis labels")
if(min(plot.data[,-1])<centre.y)
return("Error: plot.data' contains value(s) < centre.y")
if(max(plot.data[,-1])>grid.max)
return("Error: 'plot.data' contains value(s) > grid.max")
#Declare required internal functions
CalculateGroupPath <- function(df) {
#Converts variable values into a set of radial x-y coordinates
#Code adapted from a solution posted by Tony M to
#http://stackoverflow.com/questions/9614433/creating-radar-chart-a-k-a-star-plot-spider-plot-using-ggplot2-in-r
#Args:
# df: Col 1 - group ('unique' cluster / group ID of entity)
# Col 2-n: v1.value to vn.value - values (e.g. group/cluser mean or median) of variables v1 to v.n
path <- as.factor(as.character(df[,1]))
##find increment
angles = seq(from=0, to=2*pi, by=(2*pi)/(ncol(df)-1))
##create graph data frame
graphData= data.frame(seg="", x=0,y=0)
graphData=graphData[-1,]
for(i in levels(path)){
pathData = subset(df, df[,1]==i)
for(j in c(2:ncol(df))){
#pathData[,j]= pathData[,j]
graphData=rbind(graphData, data.frame(group=i,
x=pathData[,j]*sin(angles[j-1]),
y=pathData[,j]*cos(angles[j-1])))
}
##complete the path by repeating first pair of coords in the path
graphData=rbind(graphData, data.frame(group=i,
x=pathData[,2]*sin(angles[1]),
y=pathData[,2]*cos(angles[1])))
}
#Make sure that name of first column matches that of input data (in case !="group")
colnames(graphData)[1] <- colnames(df)[1]
graphData #data frame returned by function
}
CaclulateAxisPath = function(var.names,min,max) {
#Caculates x-y coordinates for a set of radial axes (one per variable being plotted in radar plot)
#Args:
#var.names - list of variables to be plotted on radar plot
#min - MININUM value required for the plotted axes (same value will be applied to all axes)
#max - MAXIMUM value required for the plotted axes (same value will be applied to all axes)
#var.names <- c("v1","v2","v3","v4","v5")
n.vars <- length(var.names) # number of vars (axes) required
#Cacluate required number of angles (in radians)
angles <- seq(from=0, to=2*pi, by=(2*pi)/n.vars)
#calculate vectors of min and max x+y coords
min.x <- min*sin(angles)
min.y <- min*cos(angles)
max.x <- max*sin(angles)
max.y <- max*cos(angles)
#Combine into a set of uniquely numbered paths (one per variable)
axisData <- NULL
for (i in 1:n.vars) {
a <- c(i,min.x[i],min.y[i])
b <- c(i,max.x[i],max.y[i])
axisData <- rbind(axisData,a,b)
}
#Add column names + set row names = row no. to allow conversion into a data frame
colnames(axisData) <- c("axis.no","x","y")
rownames(axisData) <- seq(1:nrow(axisData))
#Return calculated axis paths
as.data.frame(axisData)
}
funcCircleCoords <- function(center = c(0,0), r = 1, npoints = 100){
#Adapted from Joran's response to http://stackoverflow.com/questions/6862742/draw-a-circle-with-ggplot2
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
### Convert supplied data into plottable format
# (a) add abs(centre.y) to supplied plot data
#[creates plot centroid of 0,0 for internal use, regardless of min. value of y
# in user-supplied data]
plot.data.offset <- plot.data
plot.data.offset[,2:ncol(plot.data)]<- plot.data[,2:ncol(plot.data)]+abs(centre.y)
#print(plot.data.offset)
# (b) convert into radial coords
group <-NULL
group$path <- CalculateGroupPath(plot.data.offset)
#print(group$path)
# (c) Calculate coordinates required to plot radial variable axes
axis <- NULL
axis$path <- CaclulateAxisPath(var.names,grid.min+abs(centre.y),grid.max+abs(centre.y))
#print(axis$path)
# (d) Create file containing axis labels + associated plotting coordinates
#Labels
axis$label <- data.frame(
text=axis.labels,
x=NA,
y=NA )
#print(axis$label)
#axis label coordinates
n.vars <- length(var.names)
angles = seq(from=0, to=2*pi, by=(2*pi)/n.vars)
axis$label$x <- sapply(1:n.vars, function(i, x) {((grid.max+abs(centre.y))*axis.label.offset)*sin(angles[i])})
axis$label$y <- sapply(1:n.vars, function(i, x) {((grid.max+abs(centre.y))*axis.label.offset)*cos(angles[i])})
#print(axis$label)
# (e) Create Circular grid-lines + labels
#caclulate the cooridinates required to plot circular grid-lines for three user-specified
#y-axis values: min, mid and max [grid.min; grid.mid; grid.max]
gridline <- NULL
gridline$min$path <- funcCircleCoords(c(0,0),grid.min+abs(centre.y),npoints = 360)
gridline$mid$path <- funcCircleCoords(c(0,0),grid.mid+abs(centre.y),npoints = 360)
gridline$max$path <- funcCircleCoords(c(0,0),grid.max+abs(centre.y),npoints = 360)
#print(head(gridline$max$path))
#gridline labels
gridline$min$label <- data.frame(x=gridline.label.offset,y=grid.min+abs(centre.y),
text=as.character(grid.min))
gridline$max$label <- data.frame(x=gridline.label.offset,y=grid.max+abs(centre.y),
text=as.character(grid.max))
gridline$mid$label <- data.frame(x=gridline.label.offset,y=grid.mid+abs(centre.y),
text=as.character(grid.mid))
#print(gridline$min$label)
#print(gridline$max$label)
#print(gridline$mid$label)
### Start building up the radar plot
# Delcare 'theme_clear', with or without a plot legend as required by user
#[default = no legend if only 1 group [path] being plotted]
theme_clear <- theme_bw() +
theme(axis.text.y=element_blank(),
axis.text.x=element_blank(),
axis.ticks=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
panel.border=element_blank(),
legend.key=element_rect(linetype="blank"))
if (plot.legend==FALSE) theme_clear <- theme_clear + theme(legend.position="none")
#Base-layer = axis labels + plot extent
# [need to declare plot extent as well, since the axis labels don't always
# fit within the plot area automatically calculated by ggplot, even if all
# included in first plot; and in any case the strategy followed here is to first
# plot right-justified labels for axis labels to left of Y axis for x< (-x.centre.range)],
# then centred labels for axis labels almost immediately above/below x= 0
# [abs(x) < x.centre.range]; then left-justified axis labels to right of Y axis [x>0].
# This building up the plot in layers doesn't allow ggplot to correctly
# identify plot extent when plotting first (base) layer]
#base layer = axis labels for axes to left of central y-axis [x< -(x.centre.range)]
base <- ggplot(axis$label) + xlab(NULL) + ylab(NULL) + coord_equal() +
geom_text(data=subset(axis$label,axis$label$x < (-x.centre.range)),
aes(x=x,y=y,label=text),size=axis.label.size,hjust=1) +
scale_x_continuous(limits=c(-plot.extent.x,plot.extent.x)) +
scale_y_continuous(limits=c(-plot.extent.y,plot.extent.y))
# + axis labels for any vertical axes [abs(x)<=x.centre.range]
base <- base + geom_text(data=subset(axis$label,abs(axis$label$x)<=x.centre.range),
aes(x=x,y=y,label=text),size=axis.label.size,hjust=0.5)
# + axis labels for any vertical axes [x>x.centre.range]
base <- base + geom_text(data=subset(axis$label,axis$label$x>x.centre.range),
aes(x=x,y=y,label=text),size=axis.label.size,hjust=0)
# + theme_clear [to remove grey plot background, grid lines, axis tick marks and axis text]
base <- base + theme_clear
# + background circle against which to plot radar data
base <- base + geom_polygon(data=gridline$max$path,aes(x,y),
fill=background.circle.colour,
alpha=background.circle.transparency)
# + radial axes
base <- base + geom_path(data=axis$path,aes(x=x,y=y,group=axis.no),
colour=axis.line.colour)
# ... + group (cluster) 'paths'
base <- base + geom_path(data=group$path,aes(x=x,y=y,group=group,colour=group),
size=group.line.width) +
scale_color_brewer(palette = "Set1",levels(group))
# ... + group points (cluster data)
base <- base + geom_point(data=group$path,aes(x=x,y=y,group=group,colour=group),size=group.point.size)
#... + amend Legend title
if (plot.legend==TRUE) base <- base + labs(colour=legend.title,size=legend.text.size)
# ... + circular grid-lines at 'min', 'mid' and 'max' y-axis values
base <- base + geom_path(data=gridline$min$path,aes(x=x,y=y),
lty=gridline.min.linetype,colour=gridline.min.colour,size=grid.line.width)
base <- base + geom_path(data=gridline$mid$path,aes(x=x,y=y),
lty=gridline.mid.linetype,colour=gridline.mid.colour,size=grid.line.width)
base <- base + geom_path(data=gridline$max$path,aes(x=x,y=y),
lty=gridline.max.linetype,colour=gridline.max.colour,size=grid.line.width)
# ... + grid-line labels (max; ave; min) [only add min. gridline label if required]
if (label.gridline.min==TRUE) {
base <- base + geom_text(aes(x=x,y=y,label=text),data=gridline$min$label,face="bold",size=grid.label.size, hjust=1) }
base <- base + geom_text(aes(x=x,y=y,label=text),data=gridline$mid$label,face="bold",size=grid.label.size, hjust=1)
base <- base + geom_text(aes(x=x,y=y,label=text),data=gridline$max$label,face="bold",size=grid.label.size, hjust=1)
# ... + centre.y label if required [i.e. value of y at centre of plot circle]
if (label.centre.y==TRUE) {
centre.y.label <- data.frame(x=0, y=0, text=as.character(centre.y))
base <- base + geom_text(aes(x=x,y=y,label=text),data=centre.y.label,face="bold",size=grid.label.size, hjust=0.5) }
return(base)
}

Related

Could using grid.arrange affect plot output in R?

For a function that accepts a data frame, a plot switch value, and bin size vector, I'm trying to create plots for each count-bin and each density-bin size combination for every numerical variable column when a plot switch value of 'grid' is inputted. I have it working that I get individual plots for each combination, but when I tried to put the plots for each numerical variable on a grid and I'm seeing variations in the actual plots even though as far as I can tell I didn't alter the calculations nor the plot commands.
For instance, notice that, even accounting for scale variations, y-density plots are different in the images below. Tried looking at the documentation for grid.arrange and list just to make sure I wasn't missing something, but I still can't figure it out. Did I somehow change the plot command without realizing it?
Tested with:
data(diamonds, package = "ggplot2")
test <- diamonds[1:100,]
hstPlts(test,ps='grid') #without grid
hst1(test,ps='grid') #with grid
hstPlts <- function(df,ps = 'off',bnSize=c(30)){
#hstPlts() accepts any data frame and if the ps parameter is “on” or
#“grid”, then plot a pair of blue histograms with a vertical red line at the
#mean
#for every numerical variable at each number of bins integer specified in
#the bin
#vector parameter. if the plot switch is set to “grid”, there should be a
#grid for each count-bin combination and a separate grid for
#each density-bin size combination.
#parameters:
#df : data frame
#ps : plot switch defaulted to 'off' with two additional value options -
#'on' and 'grid'
#bnSize : vector containing numeric values representing number of bins for
histograms
num_var <- df[sapply(df,is.numeric)] #extract numeric columns
for(i in 1:length(bnSize)){ #iterate through each bin size
for(j in 1:(ncol(num_var))){ #iterate through each numeric variable
if(ps=='on' | ps =='grid'){ #conditional for both 'on' and 'grid'
#values
bnWid <- (max(num_var[,j]) - min(num_var[,j]))/bnSize[i] # compute
#bin widths
vrMn <- mean(num_var[[j]]) #compute column means for red line
mean = sprintf("%8.3f ", vrMn) #set up label for mean line with
#formatted decimals
cntPlt <- ggplot(num_var, aes(x=num_var[,j])) + #plot count
#histogram with numeric variable on x-axis
geom_histogram(colour = "blue", fill = "blue", binwidth =
bnWid) +
#detail bar fill colors and histogram bin widths
geom_vline(xintercept = mean(num_var[,j]),colour="red") +
#mean line
abs(x=colnames(num_var)[j]) #label x-axis
densPlt <- cntPlt + aes(y = ..density..) + labs(y = "density")
#create corresponding density histogram
print(cntPlt)
print(densPlt)
}
if(ps == 'grid'){ #conditional for just 'grid' value
grdPlt <- ggplot(num_var, aes(x=num_var[,j])) + #create plot for
#each count-bin combination and a separate grid for
#each density-bin size combination
geom_histogram(colour = "blue", fill = "blue", binwidth =
bnWid) +
#detail bar fill colors and histogram bin widths
labs(x=colnames(num_var)[j]) #label x-axis
print(grdPlt)
print(grdPlt + aes(y = ..density..) + labs(y = "density"))
}
}
}
}
hst1 <- function(df,ps = 'off',bsizes=c(30)){
numvar <- df[sapply(df,is.numeric)]
if(ps=='on')
for(i in 1:length(bsizes)){
for(j in 1:(ncol(numvar))){
bwidth <- (max(numvar[,j]) - min(numvar[,j]))/bsizes[i]
var_mean <- mean(numvar[[j]])
mean = sprintf("%8.3f ", var_mean)
counts <- ggplot(numvar, aes(x=numvar[,j])) +
geom_histogram(colour = "blue", fill = "blue", binwidth
= bwidth) +
geom_vline(xintercept = mean(numvar[,j]),colour="red") +
labs(x=colnames(numvar)[j]) # Labeling the x axis
dense <- counts + aes(y = ..density..) + labs(y = "density")
print(counts)
print(dense)
}
}
}
else if(ps == 'grid'){
for(i in 1:length(bsizes)){
cntLst <- list()
denseLst <- list()
for(j in 1:(ncol(numvar))){
bwidth <- (max(numvar[,j]) - min(numvar[,j]))/bsizes[i]
cntGrPlt <- ggplot(numvar, aes(x=numvar[,j])) +
geom_histogram(colour = "blue", fill = "blue", binwidth =
bwidth) +
labs(x=colnames(numvar)[j])
cntLst[[length(cntLst)+1]] <- cntGrPlt
denseLst[[j]] <- cntGrPlt + aes(y = ..density..) + labs(y =
"density")
}
grid.arrange(grobs=cntLst,ncol=2)
grid.arrange(grobs=denseLst,ncol=2)
}
}
}

Custom scatterplot matrix using facet_grid in ggplot2

I'm trying to write a custom scatterplot matrix function in ggplot2 using facet_grid. My data have two categorical variables and one numeric variable.
I'd like to facet (make the scatterplot rows/cols) according to one of the categorical variables and change the plotting symbol according to the other categorical.
I do so by first constructing a larger dataset that includes all combinations (combs) of the categorical variable from which I'm creating the scatterplot panels.
My questions are:
How to use geom_rect to white-out the diagonal and upper panels in facet_grid (I can only make the middle ones black so far)?
How can you move the titles of the facets to the bottom and left hand sides respectively?
How does one remove tick axes and labels for the top left and bottom right facets?
Thanks in advance.
require(ggplot2)
# Data
nC <- 5
nM <- 4
dat <- data.frame(
Control = rep(LETTERS[1:nC], nM),
measure = rep(letters[1:nM], each = nC),
value = runif(nC*nM))
# Change factors to characters
dat <- within(dat, {
Control <- as.character(Control)
measure <- as.character(measure)
})
# Check, lapply(dat, class)
# Define scatterplot() function
scatterplotmatrix <- function(data,...){
controls <- with(data, unique(Control))
measures <- with(data, unique(measure))
combs <- expand.grid(1:length(controls), 1:length(measures), 1:length(measures))
# Add columns for values
combs$value1 = 1
combs$value2 = 0
for ( i in 1:NROW(combs)){
combs[i, "value1"] <- subset(data, subset = Control==controls[combs[i,1]] & measure == measures[combs[i,2]], select = value)
combs[i, "value2"] <- subset(data, subset = Control==controls[combs[i,1]] & measure == measures[combs[i,3]], select = value)
}
for ( i in 1:NROW(combs)){
combs[i,"Control"] <- controls[combs[i,1]]
combs[i,"Measure1"] <- measures[combs[i,2]]
combs[i,"Measure2"] <- measures[combs[i,3]]
}
# Final pairs plot
plt <- ggplot(combs, aes(x = value1, y = value2, shape = Control)) +
geom_point(size = 8, colour = "#F8766D") +
facet_grid(Measure2 ~ Measure1) +
ylab("") +
xlab("") +
scale_x_continuous(breaks = c(0,0.5,1), labels = c("0", "0.5", "1"), limits = c(-0.05, 1.05)) +
scale_y_continuous(breaks = c(0,0.5,1), labels = c("0", "0.5", "1"), limits = c(-0.05, 1.05)) +
geom_rect(data = subset(combs, subset = Measure1 == Measure2), colour='white', xmin = -Inf, xmax = Inf,ymin = -Inf,ymax = Inf)
return(plt)
}
# Call
plt1 <- scatterplotmatrix(dat)
plt1
I'm not aware of a way to move the panel strips (the labels) to the bottom or left. Also, it's not possible to format the individual panels separately (e.g., turn off the tick marks for just one facet). So if you really need these features, you will probably have to use something other than, or in addition to ggplot. You should really look into GGally, although I've never had much success with it.
As far as leaving some of the panels blank, here is a way.
nC <- 5; nM <- 4
set.seed(1) # for reproducible example
dat <- data.frame(Control = rep(LETTERS[1:nC], nM),
measure = rep(letters[1:nM], each = nC),
value = runif(nC*nM))
scatterplotmatrix <- function(data,...){
require(ggplot2)
require(data.table)
require(plyr) # for .(...)
DT <- data.table(data,key="Control")
gg <- DT[DT,allow.cartesian=T]
setnames(gg,c("Control","H","x","V","y"))
fmt <- function(x) format(x,nsmall=1)
plt <- ggplot(gg, aes(x,y,shape = Control)) +
geom_point(subset=.(as.numeric(H)<as.numeric(V)),size=5, colour="#F8766D") +
facet_grid(V ~ H) +
ylab("") + xlab("") +
scale_x_continuous(breaks=c(0,0.5,1), labels=fmt, limits=c(-0.05, 1.05)) +
scale_y_continuous(breaks=c(0,0.5,1), labels=fmt, limits=c(-0.05, 1.05))
return(plt)
}
scatterplotmatrix(dat)
The main feature of this is the use of subset=.(as.numeric(H)<as.numeric(V)) in the call to geom_point(...). This subsets the dataset so you only get a point layer when the condition is met, e.g. in facets where is.numeric(H)<is.numeric(V). This works because I've left the H and V columns as factors and is.numeric(...) operating on a factor returns the levels, not the names.
The rest is just a more compact (and much faster) way of creating what you called comb.

Bar plot with negative and positive values centered on a non-zero value

I am trying to create a bar graph of odds ratios. Interpretationally, they are centered around 1, not 0. Therefore, I would like to have a bar graph where values below 1 are "negative" and values above 1 are "positive" (i.e., a graph of distance from 1).
Effectively, the graph should look like the following plot except that the labels on the y axis should be centered around 1, not 0.
I would like to create this graph not be relabeling of the y axis, but by actually centering it around 1. Is this possible; and if so, how?
Code to create the example graph:
data <- matrix(c(rnorm(5, 1, .5)), 5, 1)
data <- as.data.frame(data)
data[,2] <- data[,1] - .1
data[,3] <- data[,1] + .1
data <- cbind(c(letters[1:5]), data)
names(data) <- c("Category", "OR", "Lower", "Upper")
data.zero <- cbind("Category"=data[,1], data[,2:4] - 1)
require(ggplot2)
ggplot() +
geom_bar(data = data.zero, aes(x=Category, y=OR, fill=Category), stat = "identity") +
scale_fill_brewer(type = "seq", palette = 1) +
geom_errorbar(aes(x=Category, y=OR, ymin=Lower, ymax=Upper),
width=.1, position=position_dodge(), data=data.zero)
Define a simple function to add 1, like so:
plus1_formatter <- function(x) {x +1}
and than add the string
+ scale_y_continuous(labels=plus1_formatter)
to your ggplot command above.
to make it reproducible set seed(123)
before:
after:
If I understand your question correctly you don't want to relabel the y-axis, you want it centered to 1.
## Find the minimum and maximum y
mi <- min(data.zero$Lower)
ma <- max(data.zero$Upper)
Now I check which distance is greater, from 1 to mi or from 1 to ma. I set the ylim by increasing the smaller distance from 1.
if(1 - mi > ma - 1) {
newylim <- c(mi , 2 - mi) ## Lower limit remains same, Upper limit is increased
} else {
newylim <- c(2 - ma, ma) ## Upper limit remains same, Lower limit is decreased
}
then you can plot adding:
+ ylim(newylim)
The plot with your code is in the left and adding the ylim in the right.
Hope it helps,
alex

Parallel co-ordinates plot in R (ggparcoord)

I am facing a somewhat strange situation while plotting a parallel co-ordinates plot using ggparcoord. I am running the following code and it is running perfectly fine:
# Load required packages
require(GGally)
# Load datasets
data(state)
df <- data.frame(state.x77,
State = state.name,
Abbrev = state.abb,
Region = state.region,
Division = state.division
)
# Generate basic parallel coordinate plot
p <- ggparcoord(data = df,
# Which columns to use in the plot
columns = 1:4,
# Which column to use for coloring data
groupColumn = 11,
# Allows order of vertical bars to be modified
order = "anyClass",
# Do not show points
showPoints = FALSE,
# Turn on alpha blending for dense plots
alphaLines = 0.6,
# Turn off box shading range
shadeBox = NULL,
# Will normalize each column's values to [0, 1]
scale = "uniminmax" # try "std" also
)
# Start with a basic theme
p <- p + theme_minimal()
# Decrease amount of margin around x, y values
p <- p + scale_y_continuous(expand = c(0.02, 0.02))
p <- p + scale_x_discrete(expand = c(0.02, 0.02))
# Remove axis ticks and labels
p <- p + theme(axis.ticks = element_blank())
p <- p + theme(axis.title = element_blank())
p <- p + theme(axis.text.y = element_blank())
# Clear axis lines
p <- p + theme(panel.grid.minor = element_blank())
p <- p + theme(panel.grid.major.y = element_blank())
# Darken vertical lines
p <- p + theme(panel.grid.major.x = element_line(color = "#bbbbbb"))
# Move label to bottom
p <- p + theme(legend.position = "bottom")
# Figure out y-axis range after GGally scales the data
min_y <- min(p$data$value)
max_y <- max(p$data$value)
pad_y <- (max_y - min_y) * 0.1
# Calculate label positions for each veritcal bar
lab_x <- rep(1:4, times = 2) # 2 times, 1 for min 1 for max
lab_y <- rep(c(min_y - pad_y, max_y + pad_y), each = 4)
# Get min and max values from original dataset
lab_z <- c(sapply(df[, 1:4], min), sapply(df[, 1:4], max))
# Convert to character for use as labels
lab_z <- as.character(lab_z)
# Add labels to plot
p <- p + annotate("text", x = lab_x, y = lab_y, label = lab_z, size = 3)
# Display parallel coordinate plot
print(p)
I get the following output:
The moment I want to subset the data to display fewer region levels using the following statement:
df<-df[which(df$Region %in% c('South','West','Northeast')),]
I start receiving the following error:
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
contrasts can be applied only to factors with 2 or more levels
Why am I getting this error when the number of levels I want to display are clearly more than 2?
Any help on this would be much appreciated.
I figured what the problem was. I had to convert the column into factor.
df$Region <- factor(df$Region)
The above piece of code fixes the error.

PCA FactoMineR plot data

I'm running an R script generating plots of the PCA analysis using FactorMineR.
I'd like to output the coordinates for the generated PCA plots but I'm having trouble finding the right coordinates. I found results1$ind$coord and results1$var$coord but neither look like the default plot.
I found
http://www.statistik.tuwien.ac.at/public/filz/students/seminar/ws1011/hoffmann_ausarbeitung.pdf
and
http://factominer.free.fr/classical-methods/principal-components-analysis.html
but neither describe the contents of the variable created by the PCA
library(FactoMineR)
data1 <- read.table(file=args[1], sep='\t', header=T, row.names=1)
result1 <- PCA(data1,ncp = 4, graph=TRUE) # graphs generated automatically
plot(result1)
I found that $ind$coord[,1] and $ind$coord[,2] are the first two pca coords in the PCA object. Here's a worked example that includes a few other things you might want to do with the PCA output...
# Plotting the output of FactoMineR's PCA using ggplot2
#
# load libraries
library(FactoMineR)
library(ggplot2)
library(scales)
library(grid)
library(plyr)
library(gridExtra)
#
# start with a clean slate
rm(list=ls(all=TRUE))
#
# load example data
data(decathlon)
#
# compute PCA
res.pca <- PCA(decathlon, quanti.sup = 11:12, quali.sup=13, graph = FALSE)
#
# extract some parts for plotting
PC1 <- res.pca$ind$coord[,1]
PC2 <- res.pca$ind$coord[,2]
labs <- rownames(res.pca$ind$coord)
PCs <- data.frame(cbind(PC1,PC2))
rownames(PCs) <- labs
#
# Just showing the individual samples...
ggplot(PCs, aes(PC1,PC2, label=rownames(PCs))) +
geom_text()
# Now get supplementary categorical variables
cPC1 <- res.pca$quali.sup$coor[,1]
cPC2 <- res.pca$quali.sup$coor[,2]
clabs <- rownames(res.pca$quali.sup$coor)
cPCs <- data.frame(cbind(cPC1,cPC2))
rownames(cPCs) <- clabs
colnames(cPCs) <- colnames(PCs)
#
# Put samples and categorical variables (ie. grouping
# of samples) all together
p <- ggplot() + theme(aspect.ratio=1) + theme_bw(base_size = 20)
# no data so there's nothing to plot...
# add on data
p <- p + geom_text(data=PCs, aes(x=PC1,y=PC2,label=rownames(PCs)), size=4)
p <- p + geom_text(data=cPCs, aes(x=cPC1,y=cPC2,label=rownames(cPCs)),size=10)
p # show plot with both layers
# Now extract the variables
#
vPC1 <- res.pca$var$coord[,1]
vPC2 <- res.pca$var$coord[,2]
vlabs <- rownames(res.pca$var$coord)
vPCs <- data.frame(cbind(vPC1,vPC2))
rownames(vPCs) <- vlabs
colnames(vPCs) <- colnames(PCs)
#
# and plot them
#
pv <- ggplot() + theme(aspect.ratio=1) + theme_bw(base_size = 20)
# no data so there's nothing to plot
# put a faint circle there, as is customary
angle <- seq(-pi, pi, length = 50)
df <- data.frame(x = sin(angle), y = cos(angle))
pv <- pv + geom_path(aes(x, y), data = df, colour="grey70")
#
# add on arrows and variable labels
pv <- pv + geom_text(data=vPCs, aes(x=vPC1,y=vPC2,label=rownames(vPCs)), size=4) + xlab("PC1") + ylab("PC2")
pv <- pv + geom_segment(data=vPCs, aes(x = 0, y = 0, xend = vPC1*0.9, yend = vPC2*0.9), arrow = arrow(length = unit(1/2, 'picas')), color = "grey30")
pv # show plot
# Now put them side by side in a single image
#
grid.arrange(p,pv,nrow=1)
#
# Now they can be saved or exported...
Adding something extra to Ben's answer. You'll note in the first chart in Ben's response that the labels overlap somewhat. The pointLabel() function in the maptools package attempts to find locations for the labels without overlap. It's not perfect, but you can adjust the positions in the new dataframe (see below) to fine tune if you want. (Also, when you load maptools you get a note about gpclibPermit(). You can ignore it if you're concerned about the restricted licence). The first part of the script below is Ben's script.
# load libraries
library(FactoMineR)
library(ggplot2)
library(scales)
library(grid)
library(plyr)
library(gridExtra)
#
# start with a clean slate
# rm(list=ls(all=TRUE))
#
# load example data
data(decathlon)
#
# compute PCA
res.pca <- PCA(decathlon, quanti.sup = 11:12, quali.sup=13, graph = FALSE)
#
# extract some parts for plotting
PC1 <- res.pca$ind$coord[,1]
PC2 <- res.pca$ind$coord[,2]
labs <- rownames(res.pca$ind$coord)
PCs <- data.frame(cbind(PC1,PC2))
rownames(PCs) <- labs
#
# Now, the code to produce Ben's first chart but with less overlap of the labels.
library(maptools)
PCs$label=rownames(PCs)
# Base plot first for pointLabels() to get locations
plot(PCs$PC1, PCs$PC2, pch = 20, col = "red")
new = pointLabel(PCs$PC1, PCs$PC2, PCs$label, cex = .7)
new = as.data.frame(new)
new$label = PCs$label
# Then plot using ggplot2
(p = ggplot(data = PCs) +
geom_hline(yintercept = 0, linetype = 3, colour = "grey20") +
geom_vline(xintercept = 0, linetype = 3, colour = "grey20") +
geom_point(aes(PC1, PC2), shape = 20, col = "red") +
theme_bw())
(p = p + geom_text(data = new, aes(x, y, label = label), size = 3))
The result is:
An alternative is to use the biplot function from CoreR or biplot.psych from the psych package. This will put the components and the data onto the same figure.
For the decathlon data set, use principal and biplot from the psych package:
library(FactoMineR) #needed to get the example data
library(psych) #needed for principal
data(decathlon) #the data set
pc2 <- principal(decathlon[1:10],2) #just the first 10 columns
biplot(pc2,labels = rownames(decathlon),cex=.5, main="Biplot of Decathlon results")
#this is a call to biplot.psych which in turn calls biplot.
#adjust the cex parameter to change the type size of the labels.
This looks like:
!a biplot http://personality-project.org/r/images/olympic.biplot.pdf
Bill

Resources