PCA FactoMineR plot data - r

I'm running an R script generating plots of the PCA analysis using FactorMineR.
I'd like to output the coordinates for the generated PCA plots but I'm having trouble finding the right coordinates. I found results1$ind$coord and results1$var$coord but neither look like the default plot.
I found
http://www.statistik.tuwien.ac.at/public/filz/students/seminar/ws1011/hoffmann_ausarbeitung.pdf
and
http://factominer.free.fr/classical-methods/principal-components-analysis.html
but neither describe the contents of the variable created by the PCA
library(FactoMineR)
data1 <- read.table(file=args[1], sep='\t', header=T, row.names=1)
result1 <- PCA(data1,ncp = 4, graph=TRUE) # graphs generated automatically
plot(result1)

I found that $ind$coord[,1] and $ind$coord[,2] are the first two pca coords in the PCA object. Here's a worked example that includes a few other things you might want to do with the PCA output...
# Plotting the output of FactoMineR's PCA using ggplot2
#
# load libraries
library(FactoMineR)
library(ggplot2)
library(scales)
library(grid)
library(plyr)
library(gridExtra)
#
# start with a clean slate
rm(list=ls(all=TRUE))
#
# load example data
data(decathlon)
#
# compute PCA
res.pca <- PCA(decathlon, quanti.sup = 11:12, quali.sup=13, graph = FALSE)
#
# extract some parts for plotting
PC1 <- res.pca$ind$coord[,1]
PC2 <- res.pca$ind$coord[,2]
labs <- rownames(res.pca$ind$coord)
PCs <- data.frame(cbind(PC1,PC2))
rownames(PCs) <- labs
#
# Just showing the individual samples...
ggplot(PCs, aes(PC1,PC2, label=rownames(PCs))) +
geom_text()
# Now get supplementary categorical variables
cPC1 <- res.pca$quali.sup$coor[,1]
cPC2 <- res.pca$quali.sup$coor[,2]
clabs <- rownames(res.pca$quali.sup$coor)
cPCs <- data.frame(cbind(cPC1,cPC2))
rownames(cPCs) <- clabs
colnames(cPCs) <- colnames(PCs)
#
# Put samples and categorical variables (ie. grouping
# of samples) all together
p <- ggplot() + theme(aspect.ratio=1) + theme_bw(base_size = 20)
# no data so there's nothing to plot...
# add on data
p <- p + geom_text(data=PCs, aes(x=PC1,y=PC2,label=rownames(PCs)), size=4)
p <- p + geom_text(data=cPCs, aes(x=cPC1,y=cPC2,label=rownames(cPCs)),size=10)
p # show plot with both layers
# Now extract the variables
#
vPC1 <- res.pca$var$coord[,1]
vPC2 <- res.pca$var$coord[,2]
vlabs <- rownames(res.pca$var$coord)
vPCs <- data.frame(cbind(vPC1,vPC2))
rownames(vPCs) <- vlabs
colnames(vPCs) <- colnames(PCs)
#
# and plot them
#
pv <- ggplot() + theme(aspect.ratio=1) + theme_bw(base_size = 20)
# no data so there's nothing to plot
# put a faint circle there, as is customary
angle <- seq(-pi, pi, length = 50)
df <- data.frame(x = sin(angle), y = cos(angle))
pv <- pv + geom_path(aes(x, y), data = df, colour="grey70")
#
# add on arrows and variable labels
pv <- pv + geom_text(data=vPCs, aes(x=vPC1,y=vPC2,label=rownames(vPCs)), size=4) + xlab("PC1") + ylab("PC2")
pv <- pv + geom_segment(data=vPCs, aes(x = 0, y = 0, xend = vPC1*0.9, yend = vPC2*0.9), arrow = arrow(length = unit(1/2, 'picas')), color = "grey30")
pv # show plot
# Now put them side by side in a single image
#
grid.arrange(p,pv,nrow=1)
#
# Now they can be saved or exported...

Adding something extra to Ben's answer. You'll note in the first chart in Ben's response that the labels overlap somewhat. The pointLabel() function in the maptools package attempts to find locations for the labels without overlap. It's not perfect, but you can adjust the positions in the new dataframe (see below) to fine tune if you want. (Also, when you load maptools you get a note about gpclibPermit(). You can ignore it if you're concerned about the restricted licence). The first part of the script below is Ben's script.
# load libraries
library(FactoMineR)
library(ggplot2)
library(scales)
library(grid)
library(plyr)
library(gridExtra)
#
# start with a clean slate
# rm(list=ls(all=TRUE))
#
# load example data
data(decathlon)
#
# compute PCA
res.pca <- PCA(decathlon, quanti.sup = 11:12, quali.sup=13, graph = FALSE)
#
# extract some parts for plotting
PC1 <- res.pca$ind$coord[,1]
PC2 <- res.pca$ind$coord[,2]
labs <- rownames(res.pca$ind$coord)
PCs <- data.frame(cbind(PC1,PC2))
rownames(PCs) <- labs
#
# Now, the code to produce Ben's first chart but with less overlap of the labels.
library(maptools)
PCs$label=rownames(PCs)
# Base plot first for pointLabels() to get locations
plot(PCs$PC1, PCs$PC2, pch = 20, col = "red")
new = pointLabel(PCs$PC1, PCs$PC2, PCs$label, cex = .7)
new = as.data.frame(new)
new$label = PCs$label
# Then plot using ggplot2
(p = ggplot(data = PCs) +
geom_hline(yintercept = 0, linetype = 3, colour = "grey20") +
geom_vline(xintercept = 0, linetype = 3, colour = "grey20") +
geom_point(aes(PC1, PC2), shape = 20, col = "red") +
theme_bw())
(p = p + geom_text(data = new, aes(x, y, label = label), size = 3))
The result is:

An alternative is to use the biplot function from CoreR or biplot.psych from the psych package. This will put the components and the data onto the same figure.
For the decathlon data set, use principal and biplot from the psych package:
library(FactoMineR) #needed to get the example data
library(psych) #needed for principal
data(decathlon) #the data set
pc2 <- principal(decathlon[1:10],2) #just the first 10 columns
biplot(pc2,labels = rownames(decathlon),cex=.5, main="Biplot of Decathlon results")
#this is a call to biplot.psych which in turn calls biplot.
#adjust the cex parameter to change the type size of the labels.
This looks like:
!a biplot http://personality-project.org/r/images/olympic.biplot.pdf
Bill

Related

Changing aesthetics in ggplot generated by svars package in R

I'm using the svars package to generate some IRF plots. The plots are rendered using ggplot2, however I need some help with changing some of the aesthetics.
Is there any way I can change the fill and alpha of the shaded confidence bands, as well as the color of the solid line? I know in ggplot2 you can pass fill and alpha arguments to geom_ribbon (and col to geom_line), just unsure of how to do the same within the plot function of this package's source code.
# Load Dataset and packages
library(tidyverse)
library(svars)
data(USA)
# Create SVAR Model
var.model <- vars::VAR(USA, lag.max = 10, ic = "AIC" )
svar.model <- id.chol(var.model)
# Wild Bootstrap
cores <- parallel::detectCores() - 1
boot.svar <- wild.boot(svar.model, n.ahead = 30, nboot = 500, nc = cores)
# Plot the IRFs
plot(boot.svar)
I'm also looking at the command for a historical decomposition plot (see below). Is there any way I could omit the first two facets and plot only the bottom three lines on the same facet?
hist.decomp <- hd(svar.model, series = 1)
plot(hist.decomp)
Your first desired result is easily achieved by resetting the aes_params after calling plot. For your second goal. There is probably an approach to manipulate the ggplot object. Instead my approach below constructs the plot from scratch. Basically I copy and pasted the data wrangling code from vars:::plot.hd and filtered the prepared dataset for the desired series:
# Plot the IRFs
p <- plot(boot.svar)
p$layers[[1]]$aes_params$fill <- "pink"
p$layers[[1]]$aes_params$alpha <- .5
p$layers[[2]]$aes_params$colour <- "green"
p
# Helper to convert to long dataframe. Source: svars:::plot.hd
hd2PlotData <- function(x) {
PlotData <- as.data.frame(x$hidec)
if (inherits(x$hidec, "ts")) {
tsStructure = attr(x$hidec, which = "tsp")
PlotData$Index <- seq(from = tsStructure[1], to = tsStructure[2],
by = 1/tsStructure[3])
PlotData$Index <- as.Date(yearmon(PlotData$Index))
}
else {
PlotData$Index <- 1:nrow(PlotData)
PlotData$V1 <- NULL
}
dat <- reshape2::melt(PlotData, id = "Index")
dat
}
hist.decomp <- hd(svar.model, series = 1)
dat <- hd2PlotData(hist.decomp)
dat %>%
filter(grepl("^Cum", variable)) %>%
ggplot(aes(x = Index, y = value, color = variable)) +
geom_line() +
xlab("Time") +
theme_bw()
EDIT One approach to change the facet labels is via a custom labeller function. For a different approach which changes the facet labels via the data see here:
myvec <- LETTERS[1:9]
mylabel <- function(labels, multi_line = TRUE) {
data.frame(variable = labels)
}
p + facet_wrap(~variable, labeller = my_labeller(my_labels))

Multiple plot in R in a single page

I'm having trouble displaying the multiple graphs on the same page. I'm having a data frame with 18 numerical columns. For each column, I need to show its histogram and boxplot on the same page with a 4*9 grid. Following is what I tried. But I need to show it along with the boxplot as well. Through a for a loop if possible. Can someone please help me to do it.
library(gridExtra)
library(ggplot2)
p <- list()
for(i in 1:18){
x <- my_data[,i]
p[[i]] <- ggplot(gather(x), aes(value)) +
geom_histogram(bins = 10) +
facet_wrap(~key, scales = 'free_x')
}
do.call(grid.arrange,p)
I received the following graph.
When following is tried, I'm getting the graph in separate pages
library(dplyr)
dat2 <- my_data %>% mutate_all(scale)
# Boxplot from the R trees dataset
boxplot(dat2, col = rainbow(ncol(dat2)))
par(mfrow = c(2, 2)) # Set up a 2 x 2 plotting space
# Create the loop.vector (all the columns)
loop.vector <- 1:4
p <- list()
for (i in loop.vector) { # Loop over loop.vector
# store data in column.i as x
x <- my_data[,i]
# Plot histogram of x
p[[i]] <-hist(x,
main = paste("Question", i),
xlab = "Scores",
xlim = c(0, 100))
plot_grid(p, label_size = 12)
}
You can assemble the base R boxplot and the ggplot object generated with facet_wrap together using the R package patchwork:
library(ggplot2)
library(patchwork)
p <- ggplot(mtcars, aes(x = mpg)) +
geom_histogram() +
facet_wrap(~gear)
wrap_elements(~boxplot(split(mtcars$mpg, mtcars$gear))) / p
ggsave('test.png', width = 6, height = 8, units = 'in')

Get multiple polygons for scattered data in R

I have point cloud data of an area (x,y,z coordinates)
The plot of X and Y looks like:
I am trying to get polygons of different clusters in this data. I tried the following:
points <- df [,1:2] # x and y coordinates
pts <- st_as_sf(points, coords=c('X','Y'))
conc <- concaveman(pts, concavity = 0.5, length_threshold = 0)
Seems like I just get a single polygon binding the whole data. conc$polygons is a list of one variable.
How can I define multiple polygons? What am I missing when I am using concaveman and what all it can provide?
It's hard to tell from your example what variable defines your clusters. Below is an example with some simulated clusters using ggplot2 and data.table (adapted from here).
library(data.table)
library(ggplot2)
# Simulate data:
set.seed(1)
n_cluster = 50
centroids = cbind.data.frame(
x=rnorm(5, mean = 0, sd=5),
y=rnorm(5, mean = 0, sd=5)
)
dt = rbindlist(
lapply(
1:nrow(centroids),
function(i) {
cluster_dt = data.table(
x = rnorm(n_cluster, mean = centroids$x[i]),
y = rnorm(n_cluster, mean = centroids$y[i]),
cluster = i
)
}
)
)
dt[,cluster:=as.factor(cluster)]
# Find convex hull of each point by cluster:
hulls = dt[,.SD[chull(x,y)],by=.(cluster)]
# Plot:
p = ggplot(data = dt, aes(x=x, y=y, colour=cluster)) +
geom_point() +
geom_polygon(data = hulls,aes(fill=cluster,alpha = 0.5)) +
guides(alpha=F)
This produces the following output:
Edit
If you don't have predefined clusters, you can use a clustering algorithm. As a simple example, see below for a solution using kmeans with 5 centroids.
# Estimate clusters (e.g. kmeans):
dt[,km_cluster := as.factor(kmeans(.SD,5)$cluster),.SDcols=c("x","y")]
# Find convex hull of each point:
hulls = dt[,.SD[chull(x,y)],by=.(km_cluster)]
# Plot:
p = ggplot(data = dt, aes(x=x, y=y, colour=km_cluster)) +
geom_point() +
geom_polygon(data = hulls,aes(fill=km_cluster,alpha = 0.5)) +
guides(alpha=F)
In this case the output for the estimated clusters is almost equivalent to the constructed ones.

R Subset of pam, Arrange multiple figures in one

I'm struggling with the following problem:
I use pam to cluster my dataset v in 7 clusters:
x <- pam(v,7)
I know that there is a vector clustering in x which contains the according numbers of clusters.
I would like to get a subset of x which only contains cluster 1.
Is this possible?
Edit:
Here is an example. Cluster iris in three clusters and plot them.
library(ggfortify)
library(cluster)
v <- iris[-5]
x <- pam(v,3)
autoplot(x, frame = TRUE, frame.type = 'norm')
The question: How can I plot only the first cluster? It should look like the first plot without cluster 2 and 3.
Edit: I think I found a solution. Therefore I don't use autoplot anymore but calculate the convex hull of every cluster and plot it.
library(cluster)
library(plyr)
library(ggplot2)
library(ggrepel)
find_hull <- function(df) df[chull(df$x, df$y),]
v<-iris[-5]
pp <- pam(v,3)
n<-princomp(pp$data, scores = TRUE, cor = ncol(pp$data) != 2)$scores
df<-data.frame(n[,1],n[,2],pp$clustering)
colnames(df)<-c("x","y","z")
hulls <- ddply(df, "z", find_hull)
p<-qplot(x,y,data=df,color=as.factor(z))+
geom_polygon(data=hulls, alpha=1, fill=NA)+
geom_text_repel(aes(label = rownames(df)),arrow = arrow(length = unit(0.00, 'inches'), angle = 0.00),size=5.5,colour="grey55")+
theme_classic(base_size = 16)+
theme(axis.line=element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(),axis.ticks=element_blank(),
axis.title.x=element_blank(),axis.title.y=element_blank(),legend.position="none",
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
p
df2<-df[df$z==1,]
hulls <- ddply(df2, "z", find_hull)
p1<-qplot(x,y,data=df2,color=as.factor(z))+
geom_polygon(data=hulls, alpha=0.8, fill=NA)+
geom_text_repel(aes(label = rownames(df2)),arrow = arrow(length = unit(0.00, 'inches'), angle = 0.00),size=5.5,colour="grey25")+
theme_classic(base_size = 16)+
theme(axis.line=element_blank(),axis.text.x=element_blank(),axis.text.y=element_blank(),axis.ticks=element_blank(),
axis.title.x=element_blank(),axis.title.y=element_blank(),legend.position="none",
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())+
p1
Now I want to plot both figures in one device. I have already tried the multiplot from cookbook-r but it gives the error
Error: Aesthetics must be either length 1 or the same as the data (26): label, x, y
It must be because of the labels I guess.
I also tried
grid.arrange(p,p1, ncol=1)
from the gridExtra package but it gives the same error.
Is there any other option to arrange multiple figures with labels in one figure?

How to remove center of range from ggplot histogram in R

I generated the histogram with the following code:
# Load Data
file <- "SharedData.csv"
data <- read.csv(file,header = TRUE,sep = ",")
## Bin Levels
data$xLevel <- cut(data$xLevel,
breaks = quantile(data$xLevel,(0:5)/5),
labels = paste("Quant",1:5,sep = "."),
include.lowest = TRUE,ordered_result = TRUE)
# Histogram
g <- ggplot(data, aes(x=xTime,color = xLevel)) +
geom_histogram(aes(y=..density..),
binwidth=100)
g
How do I create the above histogram with an x axis that goes from 0-300 and from 1500-2400, but not include 300-1500? The unit here is military time.
Data: https://www.dropbox.com/s/e5gaym7dhefs04e/SharedData.csv?dl=0
According to https://groups.google.com/forum/#!topic/ggplot2/jSrL_FnS8kc and Using ggplot2, can I insert a break in the axis? it does not seem to be possible. you can plots 2 graphics instead
first create a new column splitting the data in two
data$Block <- ifelse(data$xTime <=500, "A", "B")
and then plot the graphics
library(scales)
# Histogram
g <- ggplot(data, aes(x=xTime,color = xLevel)) +
geom_histogram(aes(y=..density..),
binwidth=100) + facet_grid(.~Block, scales = "free_x")
g

Resources