Related
I am using WGCNA package for network analysis with following steps:
Data input
Generate Modules
Get gene id
Phenotype x module correlation
I want to use the package to include the phenotype data together with the gene expression matrix to find which genes group with the phenotypes. Then, I want to get the module of interest and do a network map and check which genes relate to the phenotypes.
I generated modules like:
library(WGCNA)
options(stringsAsFactors = FALSE)
enableWGCNAThreads()
lnames = load(file = "dataInput.RData");
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=40, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
##Constructing the gene network and identifying modules is now a simple function call:
net_unsigned = blockwiseModules(datExpr, power = 6,
TOMType = "unsigned", minModuleSize = 30, maxBlockSize = 300,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "PopulusTOM_signed",
verbose = 5)
##maxBlockSize = The total number of genes you have in your gene expression matrix that passed the filter from Data_Input scrip
##Plotting graph
pdf("Dendogram_Modules_signed.pdf", width = 30, height = 30);
##Convert labels to colors for plotting
mergedColors = labels2colors(net_unsigned$colors)
##Plot the dendrogram and the module colors underneath
plotDendroAndColors(net_unsigned$dendrograms[[1]], mergedColors[net_unsigned$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
##Save
moduleLabels = net_unsigned$colors
moduleColors = labels2colors(net_unsigned$colors)
MEs = net_unsigned$MEs;
geneTree = net_unsigned$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
file = "unsigned-networkConstruction-auto.RData")
This generates modules and then I correlated a module with one phenotype. How can I include the phenotypic data with gene expression? Thank you!
I'm plotting decision trees built with partykit in ggparty, and struggling to rotate the tree branches around branch nodes- as in, change the order they are displayed. I'm hoping to plot a tree with the branches displayed in an order according to the values of the terminal nodes. This would be somewhat comparable to using reorder() in a standard ggplot geom.
For instance, outlined below using the WeatherPlay data from the ggparty vignette, can the branches of this tree be rotated so that terminal node geom_bar() plots are displayed in increasing order of proportion "yes"? In this case, this would mean "sunny", then "rainy", then "overcast".
For my project I'll have terminal nodes with boxplots, but I'm guessing that the method for rotating branches is modular and can be repurposed.
## Playing with default WeatherPlay data as demonstrated in ggparty examples here:
# https://cran.r-project.org/web/packages/ggparty/vignettes/ggparty-graphic-partying.html
library(partykit)
library(ggparty)
data("WeatherPlay", package = "partykit")
sp_o <- partysplit(1L, index = 1:3)
sp_h <- partysplit(3L, breaks = 75)
sp_w <- partysplit(4L, index = 1:2)
pn <- partynode(1L, split = sp_o, kids = list(
partynode(2L, split = sp_h, kids = list(
partynode(3L, info = "yes"),
partynode(4L, info = "no"))),
partynode(5L, info = "yes"),
partynode(6L, split = sp_w, kids = list(
partynode(7L, info = "yes"),
partynode(8L, info = "no")))))
py <- party(pn, WeatherPlay)
# Node plots
n1 <- partynode(id = 1L, split = sp_o, kids = lapply(2L:4L, partynode))
t2 <- party(n1,
data = WeatherPlay,
fitted = data.frame(
"(fitted)" = fitted_node(n1, data = WeatherPlay),
"(response)" = WeatherPlay$play,
check.names = FALSE),
terms = terms(play ~ ., data = WeatherPlay)
)
t2 <- as.constparty(t2)
# Plot tree
ggparty(t2) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(geom_bar(aes(x = "", fill = play),
position = position_fill()),
xlab("play")),
# draw only one label for each axis
shared_axis_labels = TRUE,
# draw line between tree and legend
legend_separator = TRUE
)
I am plotting multiple shapefiles using spplot. Here's a data to construct that
library(raster)
library(randomcoloR)
my.shp <- getData('GADM', country = 'BRA', level = 2)
my.shp$ID<- 1:nrow(my.shp)
My data consists of a variable X for 10 years as shown where each column is a year
df <- matrix(sample(100:5000, 55040, replace = T), nrow = 5504, ncol = 10)
df <- data.frame(ID = 1:nrow(my.shp), df)
my.dat <- merge(my.shp, df, by = "ID")
variable.names <- paste0("X",1:10)
spplot(my.dat, rev(variable.names), col = NA, at = seq(from = 100, to = 5000, by = 500),
col.regions = distinctColorPalette(length(seq(from = 100, to = 5000, by = 500))),
main = list(label = "TEST"))
My problem is this plot takes so much time (around an hour) to get plotted and was wondering if there is something inherently wrong in the code itself that it is taking too long to plot. My laptop has a 32 GB RAM.
Thanks
I haven't compared this plot to your spplot because I don't want to spend an hour waiting for it.
Instead I'm proposing to use library(mapdeck) to plot an interactive map, which takes a matter of seconds.
Two things to note
You need a Mapbox Access token
You need to convert the sp object to sf
library(raster)
my.shp <- getData('GADM', country = 'BRA', level = 2)
my.shp$ID <- 1:nrow(my.shp)
df <- matrix(sample(100:5000, 55040, replace = T), nrow = 5504, ncol = 10)
df <- data.frame(ID = 1:nrow(my.shp), df)
my.dat <- merge(my.shp, df, by = "ID")
library(sf)
sf <- sf::st_as_sf( my.dat )
library(mapdeck)
set_token( "YOUR_MAPBOX_TOKEN" )
mapdeck() %>%
add_sf(
data = sf
, fill_colour = "GID_2"
)
Are you willing/able to switch to sf instead of sp?
The sf plot function is considerably faster than spplot, although the layout differs a bit.
library(sf)
my.dat_sf <- st_as_sf(my.dat)
plot(my.dat_sf[rev(variable.names)], max.plot=10, breaks=c(seq(from = 100, to = 5000, by = 500),5000),
pal = distinctColorPalette(length(seq(from = 100, to = 5000, by = 500))),
main = "TEST", border=NA, key.pos=4)
Additionally, you could try to simplify the polygon with rmapshaper::ms_simplify() for Spatial*-objects or sf::st_simplify() for SimpleFeatures, which lets you reduce the object size by quite a bit, depending on the given dTolerance. Thus plotting, will also be faster with simplified polygons.
The original SpatialPolygon:
format(object.size(my.dat_sf), units="Kb")
"25599.2 Kb"
and a simplified SimpleFeature:
dat_sf_simple <- st_transform(my.dat_sf, crs = 3035)
dat_sf_simple <- st_simplify(dat_sf_simple, dTolerance = 1000, preserveTopology = T)
dat_sf_simple <- st_transform(dat_sf_simple, crs = 4326)
format(object.size(dat_sf_simple), units="Kb")
"7864.2 Kb"
Plot the simplified SimpleFeature, which takes about 1 minute on my machine with 8GB RAM.
plot(dat_sf_simple[rev(variable.names)], max.plot=10, breaks=c(seq(from = 100, to = 5000, by = 500),5000),
pal = distinctColorPalette(length(seq(from = 100, to = 5000, by = 500))),
main = "TEST", border=NA, key.pos=4)
You could also try out with ggplot2, but I am pretty sure the most performant solution will be the sf plot.
library(ggplot2)
library(dplyr)
library(tidyr)
dat_sf_simple_gg <- dat_sf_simple %>%
dplyr::select(rev(variable.names), geometry) %>%
gather(VAR, SID, -geometry)
ggplot() +
geom_sf(data = dat_sf_simple_gg, aes(fill=SID)) +
facet_wrap(~VAR, ncol = 2)
Given a data frame containing mixed variables (i.e. both categorical and continuous) like,
digits = 0:9
# set seed for reproducibility
set.seed(17)
# function to create random string
createRandString <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
df <- data.frame(ID=c(1:10), name=sample(letters[1:10]),
studLoc=sample(createRandString(10)),
finalmark=sample(c(0:100),10),
subj1mark=sample(c(0:100),10),subj2mark=sample(c(0:100),10)
)
I perform unsupervised feature selection using the package FactoMineR
df.princomp <- FactoMineR::FAMD(df, graph = FALSE)
The variable df.princomp is a list.
Thereafter, to visualize the principal components I use
fviz_screeplot() and fviz_contrib() like,
#library(factoextra)
factoextra::fviz_screeplot(df.princomp, addlabels = TRUE,
barfill = "gray", barcolor = "black",
ylim = c(0, 50), xlab = "Principal Component",
ylab = "Percentage of explained variance",
main = "Principal Component (PC) for mixed variables")
factoextra::fviz_contrib(df.princomp, choice = "var",
axes = 1, top = 10, sort.val = c("desc"))
which gives the following Fig1
and Fig2
Explanation of Fig1: The Fig1 is a scree plot. A Scree Plot is a simple line segment plot that shows the fraction of total variance in the data as explained or represented by each Principal Component (PC). So we can see the first three PCs collectively are responsible for 43.8% of total variance. The question now naturally arises, "What are these variables?". This I have shown in Fig2.
Explanation of Fig2: This figure visualizes the contribution of rows/columns from the results of Principal Component Analysis (PCA). From here I can see the variables, name, studLoc and finalMark are the most important variables that can be used for further analysis.
Further Analysis- where I'm stuck at: To derive the contribution of the aforementioned variables name, studLoc, finalMark. I use the principal component variable df.princomp (see above) like df.princomp$quanti.var$contrib[,4]and df.princomp$quali.var$contrib[,2:3].
I've to manually specify the column indices [,2:3] and [,4].
What I want: I want to know how to do dynamic column index assignment, such that I do not have to manually code the column index [,2:3] in the list df.princomp?
I've already looked at the following similar questions 1, 2, 3 and 4 but cannot find my solution? Any help or suggestions to solve this problem will be helpful.
Not sure if my interpretation of your question is correct, apologies if not. From what I gather you are using PCA as an initial tool to show you what variables are the most important in explaining the dataset. You then want to go back to your original data, select these variables quickly without manual coding each time, and use them for some other analysis.
If this is correct then I have saved the data from the contribution plot, filtered out the variables that have the greatest contribution, and used that result to create a new data frame with these variables alone.
digits = 0:9
# set seed for reproducibility
set.seed(17)
# function to create random string
createRandString <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
df <- data.frame(ID=c(1:10), name=sample(letters[1:10]),
studLoc=sample(createRandString(10)),
finalmark=sample(c(0:100),10),
subj1mark=sample(c(0:100),10),subj2mark=sample(c(0:100),10)
)
df.princomp <- FactoMineR::FAMD(df, graph = FALSE)
factoextra::fviz_screeplot(df.princomp, addlabels = TRUE,
barfill = "gray", barcolor = "black",
ylim = c(0, 50), xlab = "Principal Component",
ylab = "Percentage of explained variance",
main = "Principal Component (PC) for mixed variables")
#find the top contributing variables to the overall variation in the dataset
#here I am choosing the top 10 variables (although we only have 6 in our df).
#note you can specify which axes you want to look at with axes=, you can even do axes=c(1,2)
f<-factoextra::fviz_contrib(df.princomp, choice = "var",
axes = c(1), top = 10, sort.val = c("desc"))
#save data from contribution plot
dat<-f$data
#filter out ID's that are higher than, say, 20
r<-rownames(dat[dat$contrib>20,])
#extract these from your original data frame into a new data frame for further analysis
new<-df[r]
new
#finalmark name studLoc
#1 53 b POTYQ0002N
#2 73 i LWMTW1195I
#3 95 d VTUGO1685F
#4 39 f YCGGS5755N
#5 97 c GOSWE3283C
#6 58 g APBQD6181U
#7 67 a VUJOG1460V
#8 64 h YXOGP1897F
#9 15 j NFUOB6042V
#10 81 e QYTHG0783G
Based on your comment, where you said you wanted to 'Find variables with value greater than 5 in Dim.1 AND Dim.2 and save these variables to a new data frame', I would do this:
#top contributors to both Dim 1 and 2
f<-factoextra::fviz_contrib(df.princomp, choice = "var",
axes = c(1,2), top = 10, sort.val = c("desc"))
#save data from contribution plot
dat<-f$data
#filter out ID's that are higher than 5
r<-rownames(dat[dat$contrib>5,])
#extract these from your original data frame into a new data frame for further analysis
new<-df[r]
new
(This keeps all the original variables in our new data frame since they all contributed more than 5% to the total variance)
There are a lot of ways to extract contributions of individual variables to PCs. For numeric input, one can run a PCA with prcomp and look at $rotation (I spoke to soon and forgot you've got factors here so prcomp won't work directly). Since you are using factoextra::fviz_contrib, it makes sense to check how that function extracts this information under the hood. Key factoextra::fviz_contrib and read the function:
> factoextra::fviz_contrib
function (X, choice = c("row", "col", "var", "ind", "quanti.var",
"quali.var", "group", "partial.axes"), axes = 1, fill = "steelblue",
color = "steelblue", sort.val = c("desc", "asc", "none"),
top = Inf, xtickslab.rt = 45, ggtheme = theme_minimal(),
...)
{
sort.val <- match.arg(sort.val)
choice = match.arg(choice)
title <- .build_title(choice[1], "Contribution", axes)
dd <- facto_summarize(X, element = choice, result = "contrib",
axes = axes)
contrib <- dd$contrib
names(contrib) <- rownames(dd)
theo_contrib <- 100/length(contrib)
if (length(axes) > 1) {
eig <- get_eigenvalue(X)[axes, 1]
theo_contrib <- sum(theo_contrib * eig)/sum(eig)
}
df <- data.frame(name = factor(names(contrib), levels = names(contrib)),
contrib = contrib)
if (choice == "quanti.var") {
df$Groups <- .get_quanti_var_groups(X)
if (missing(fill))
fill <- "Groups"
if (missing(color))
color <- "Groups"
}
p <- ggpubr::ggbarplot(df, x = "name", y = "contrib", fill = fill,
color = color, sort.val = sort.val, top = top, main = title,
xlab = FALSE, ylab = "Contributions (%)", xtickslab.rt = xtickslab.rt,
ggtheme = ggtheme, sort.by.groups = FALSE, ...) + geom_hline(yintercept = theo_contrib,
linetype = 2, color = "red")
p
}
<environment: namespace:factoextra>
So it's really just calling facto_summarize from the same package. By analogy you can do the same thing, simply call:
> dd <- factoextra::facto_summarize(df.princomp, element = "var", result = "contrib", axes = 1)
> dd
name contrib
ID ID 0.9924561
finalmark finalmark 21.4149175
subj1mark subj1mark 7.1874438
subj2mark subj2mark 16.6831560
name name 26.8610132
studLoc studLoc 26.8610132
And that's the table corresponding to your figure 2. For PC2 use axes = 2 and so on.
Regarding "how to programmatically determine the column indices of the PCs", I'm not 100% sure I understand what you want, but if you just want to say for column "finalmark", grab its contribution to PC3 you can do the following:
library(tidyverse)
# make a tidy table of all column names in the original df with their contributions to all PCs
contribution_df <- map_df(set_names(1:5), ~factoextra::facto_summarize(df.princomp, element = "var", result = "contrib", axes = .x), .id = "PC")
# get the contribution of column 'finalmark' by name
contribution_df %>%
filter(name == "finalmark")
# get the contribution of column 'finalmark' to PC3
contribution_df %>%
filter(name == "finalmark" & PC == 3)
# or, just the numeric value of contribution
filter(contribution_df, name == "finalmark" & PC == 3)$contrib
BTW I think ID in your example is treated as numeric instead of factor, but since it's just an example I'm not bothering with it.
I have a problem forecasting this non stationery data(https://drive.google.com/file/d/14o5hHe8zxR0onRWq0mZNcYqI101O0Dkw/view?usp=sharing) using Auto Arima.
Please review my code.
# Read Data
r = read.csv('../Amazon/Amazon1.csv', header = TRUE, stringsAsFactors = FALSE)
# Time Series construction
ts = ts(t(r[,1:25]), frequency = 12, start = c(2016,01) )
# plotting Time series
ts.plot(ts[,1:2],type = 'b', xlab = 'Monthly Cycle', ylab = 'Number of Sales', main = "(TIME SERIES) Amazon Sales Cycle of multiple products for 24 months",col=c(rep("black",1),rep("red",2)))
legend("topleft",cex=.65,legend = ts[1,1:2], col = 1:ncol(ts), lty = 1)
set1 = ts[2:20,1]
#set2 = ts[15:20,1]
set1 = as.numeric(set1)
#set2 = as.numeric(set2)
# Building Forecasting models
mf = meanf(set1,h=4,level=c(90,95),fan=FALSE,lambda=NULL)
plot(mf)
mn = naive(set1,h=4,level=c(90,95),fan=FALSE,lambda=NULL)
plot(mn)
md = rwf(set1,h=4,drift=T,level=c(90,95),fan=FALSE,lambda=NULL)
plot(md)
# Checking Accuracy
accuracy(mf)
accuracy(mn)
accuracy(md)
# Identifying Stationarity/Non-Stationarity(unit Root testing)
adf = adf.test(set1)
adf
kpss = kpss.test(set1)
kpss
ndiffs(set1)
diff_data = diff(set1)
adf.test(diff_data) # Rerunning unit test on differenced data
# Identifying Seasonality/Trend
Stl = stl(set1,s.window='periodic')
# ARIMA modelling
ar_set1 = forecast::auto.arima(diff(diff_data), approximation=FALSE,trace=FALSE, stationary = TRUE)
forecast(ar_set1, h = 5)
ar_set2 = forecast::ets(diff(diff_data))
ar_set3 = forecast::nnetar(diff(diff_data), approximation=FALSE,trace=FALSE, stationary = TRUE)
# Prediction
predict(ar_set1, n.ahead = 5,se.fit = TRUE)
predict(ar_set2, n.ahead = 5,se.fit = TRUE)
predict(ar_set3, n.ahead = 5,se.fit = TRUE)
plot(forecast(ar_set1,h=6))
points(1:length(diff(diff_data)),fitted(ar_set1),type="l",col="green")
plot(forecast(ar_set2,h=6))
points(1:length(diff(diff_data)),fitted(ar_set2),type="l",col="green")
plot(forecast(ar_set3,h=6))
points(1:length(diff(diff_data)),fitted(ar_set3),type="l",col="green")
accuracy(ar_set1, test = diff_data)
accuracy(ar_set2, test = diff_data)
accuracy(ar_set3, test = diff_data)
I'm unable to get the forecast values using either of those 3 methods. Where am i going wrong?