How to get total variance explained by each principal component in R

I have created a PCA plot using:
library(SNPRelate)
library(gdsfmt)
vcf.fn <- "input.vcf"
snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only")
snpgdsSummary("test.gds")
genofile <- snpgdsOpen("test.gds")
pop_code <- read.gdsn(index.gdsn(genofile, "genotype"))
snpset <- snpgdsLDpruning(genofile, autosome.only=FALSE, ld.threshold=0.2, maf= 0.01, missing.rate=0.5)
snpset.id <- unlist(snpset)
pca <- snpgdsPCA(genofile, autosome.only=FALSE, snp.id=snpset.id, num.thread=2)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
tab <- data.frame(sample.id = pca$sample.id,
                  EV1 = pca$eigenvect[,1],  # the first eigenvector
                  EV2 = pca$eigenvect[,2],  # the second eigenvector
                  stringsAsFactors = FALSE)
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")
The PCA plot looks like this: https://i.stack.imgur.com/WBeKT.png
I have created a matrix with sample names in rows and the first five PCs in columns:
sample.id EV1 EV2 EV3 EV4 EV5
1 T11 -0.007433146 -0.038371106 0.079585181 0.069839389 0.12178713
2 T3 -0.014198086 0.069641911 0.006414285 -0.004750456 0.046201258
3 T10 -0.086656303 0.026455731 -0.028758639 -0.015004286 -0.007497732
4 T162 -0.00520634 0.053996842 0.021754194 -0.004660844 0.006939661
5 T163 -0.020055447 0.027697494 -0.006933852 -0.058596466 0.028236645
I want to check how much variation is explained by each PC. Thank you for your help!
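Note that the quantity you're asking about is already computed in the snippet above: snpgdsPCA() returns the proportion of variance explained by each component in its varprop element, and pc.percent converts that to percentages. A minimal sketch to label and visualize it (the barplot is just one simple way to draw a scree plot):
pc.percent <- pca$varprop * 100
names(pc.percent) <- paste0("PC", seq_along(pc.percent))
head(round(pc.percent, 2))   # percent of total variance explained by each PC
barplot(pc.percent[1:5], ylab = "Variance explained (%)",
        main = "First five principal components")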

Related

Need help plotting analytical solution of phytoplankton resource competition model in R

I'm working on a one-species, two-resource phytoplankton competition model based on Tilman's work in the 70s and 80s. I have a data frame set up for the analytical solution but am really struggling with the syntax to plot the graphs I need. Here is my code so far:
library(dplyr)
r <- 0.1
g1 <- 0.001
g2 <- 0.01
v1 <- 0.1
v2 <- 1
k1 <- 0.01
k2 <- 0.1
d <- 0.15
params <- list(r = 0.1,
               g1 = 0.001,
               g2 = 0.01,
               d = 0.5,
               v1 = 0.1,
               v2 = 1,
               k1 = 0.01,
               k2 = 0.1)
df <- data.frame(s02 = seq(10, 1, -1)) |>
  mutate(
    s1_star = (r*g1*k1*d)-((v1*(r-d))-r*g1*d),
    s2_star = (r*g2*k2*d)-((v2*(r-d))-r*g2*d),
    ## Tilman eq 17: supply concentration of resource 1 in the reservoir that
    ## would result in co-limitation given some concentration of resource 2
    ## (s02) in the reservoir
    s01 = s1_star+((s02-s2_star)*(g1/g2)),
    s1_limiting_ratio = s02/s01  ## ratio of supply points that result in co-limitation
  )
limiting_ratio <- cbind(params, df) |> as.data.frame()
library(ggplot2)
limiting_ratio |> ggplot(aes(x = s1_star, y = s2_star)) + geom_line()
I want to plot s1_star and s2_star as the axes (which I did), but I'm also trying to add s1_limiting_ratio as a line on the graph; it's the ratio s02/s01, which represents when resource 1 (S1) and resource 2 (S2) are co-limited. Then I want to plot various values of s01 and s02 on the graph to see where they fall, to determine which resource is limiting and therefore which resource equation (S1 or S2) to use in the analytical solution.
I've tried googling for ggplot help, but I'm struggling to apply it to the graph I need. I'm still fairly new to R and definitely pretty new to ggplot, so I really appreciate any help and advice!
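In ggplot2 you add extra layers with +, so a reference line and candidate supply points can be drawn on top of the existing plot. A hedged sketch, assuming the limiting_ratio data frame from above and a made-up supply_points data frame whose values are purely illustrative:
library(ggplot2)
# hypothetical supply points to classify; replace with your own s01/s02 values
supply_points <- data.frame(s01 = c(0.5, 2, 5), s02 = c(1, 4, 8))
ggplot(limiting_ratio, aes(x = s1_star, y = s2_star)) +
  geom_line() +
  # reference ray through the origin; the slope (mean co-limitation ratio)
  # is a placeholder for whatever line you actually want to draw
  geom_abline(intercept = 0, slope = mean(limiting_ratio$s1_limiting_ratio),
              linetype = "dashed") +
  geom_point(data = supply_points, aes(x = s01, y = s02), colour = "red")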

How to loop a function through groups of data?

I have code that calculates the Kaplan-Meier product-limit estimate:
library(tidyverse)

km_mean <- function(x, nd) {
  # first remove any missing data
  df <- tibble(x, nd) %>% filter(!is.na(x))
  x <- df %>% pull(x); nd <- df %>% pull(nd)
  # handle cases of all detects or all nondetects; in these situations, no
  # Kaplan-Meier estimate is possible or necessary; instead treat all detects
  # as actual concentration estimates and all NDs as imputed at half their
  # reporting limits
  if (all(nd == 0)) return(tibble(mean = mean(x), sd = sd(x)))
  if (all(nd == 1)) return(tibble(mean = mean(x/2), sd = sd(x/2)))
  # for cases with mixed detects and NDs, table by nd status; determine unique
  # x values; first subtract epsilon from each nondetect so that detects tied
  # with NDs at the same reporting limit get the larger rank
  eps <- 1e-6
  x <- x - nd*eps
  nn <- nlevels(factor(x))
  # determine number of at-risk values; build Kaplan-Meier CDF and survival
  # function; note: <tab> must be augmented and adjusted for the calculation
  # below to work correctly
  km.lev <- as.numeric(levels(factor(x)))
  xa <- c(x, max(x) + 1); nda <- c(nd, 0)
  tab <- table(xa, nda)
  tab[nn + 1, 1] <- 0
  km.rsk <- cumsum(tab[, 1] + tab[, 2])
  km.cdf <- rev(cumprod(1 - rev(tab[, 1])/rev(km.rsk)))[-1]
  names(km.cdf) <- as.character(km.lev)
  km.surv <- 1 - km.cdf
  km.out <- tibble(km.lev, km.rsk = km.rsk[-length(km.rsk)], km.cdf, km.surv)
  row.names(km.out) <- NULL
  # estimate adjusted mean and SD
  xm <- km.lev[1] + sum(diff(km.lev)*km.surv[-length(km.surv)])
  dif <- diff(c(0, km.cdf))
  xsd <- sqrt(sum(dif*(km.lev - xm)^2))
  names(xm) <- NULL; names(xsd) <- NULL
  tibble(mean = xm, sd = xsd)
}
My data has three columns: a sample ID, a value (x), and a detect/non-detect flag (nd).
id    x     nd
a1    0.23  0
a1    2.3   0
a1    1.6   0
a2    3.0   1
a2    3.1   0
a2    2.76  0
How can I adapt the function to run on all a1 samples as a group, then a2, and so on?
I've tried group_by commands, but can't seem to make it work.
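Since km_mean() already returns a one-row tibble, dplyr's group_modify() is a natural fit: it applies a data-frame-returning function to each group and row-binds the results. A sketch, assuming the data frame is called dat and its columns are id, x, and nd (names assumed):
library(dplyr)
dat %>%
  group_by(id) %>%
  group_modify(~ km_mean(.x$x, .x$nd)) %>%
  ungroup()
# expected result: one row per id, with columns mean and sd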

Adjust implausible imputed values in an optimized way

I have a dataset with some imputed values. According to a predefined edit rule, some of these imputed values are implausible. For that reason, I want to adjust these implausible imputed values, but the adjustment should be as small as possible.
Here is a simplified example:
# Seed
set.seed(111)
# Example data
data <- data.frame(x1 = round(rnorm(200, 5, 5), 0),
                   x2 = factor(round(runif(200, 1, 3), 0)),
                   x3 = round(rnorm(200, 2, 10), 0),
                   x4 = factor(round(runif(200, 0, 5), 0)))
data[data$x1 > 5 & data$x2 == 1, ]$x3 <- 4
data[data$x1 > 5 & data$x2 == 1, ]$x4 <- 5
# Missings
data$x1[sample(1:nrow(data), 25)] <- NA
data$x2[sample(1:nrow(data), 50)] <- NA
data$x3[sample(1:nrow(data), 40)] <- NA
data$x4[sample(1:nrow(data), 35)] <- NA
# Imputation
library("mice")
imp <- mice(data, m = 1)
# Imputed data
data_imp <- complete(imp, 1)  # action "repeated" would rename the columns (x1.1, ...), breaking the checks below
# So far everything works well.
# However, there is a predefined edit rule, which should not be violated.
# Edit Rule:
# If x1 > 5 and x2 == 1
# Then x3 > 3 and x4 > 4
# Because of the imputation, some of the observations have implausible values.
implausible <- data_imp[data_imp$x1 > 5 & data_imp$x2 == 1 &
                          (data_imp$x3 <= 3 | (data_imp$x4 != 4 & data_imp$x4 != 5)), ]
implausible
# Example 1)
# In row 26 x1 has a value > 5 and x2 equals 1.
# For that reason, x3 would have to be larger than 3 (here x3 is -17).
# Like you can see in the original data, x2 has been imputed in row 26.
data[rownames(implausible), ]
# Hence, x2 would have to be adjusted, so that it randomly gets a different category.
# Example 2)
# In row 182 are also implausible values.
# Three of the variables have been imputed in this row.
# Therefore, all/some of the imputed cells would have to be adjusted,
# but the adjustment should be as small as possible.
I have already done some research and found some relevant papers/books that describe suitable optimization algorithms:
Pannekoek & Zhang (2011): https://www.researchgate.net/publication/269410841_Partial_donor_Imputation_with_Adjustments
de Waal, Pannekoek & Scholtus (2011): Handbook of Statistical Data Editing and Imputation
However, I am struggling with implementing these algorithms in R. Is there a package available that helps with this kind of calculation? I'd really appreciate some help with my code or some pointers on the topic!
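I can't vouch for a package that implements the Pannekoek & Zhang adjustment end-to-end, but the validate family of packages (validate for declaring edit rules; errorlocate and rspa for locating errors and minimally adjusting records) was written around exactly this literature and may be worth a look. A hedged sketch of the rule-checking step, assuming validate's rule syntax:
library(validate)
rules <- validator(
  if (x1 > 5 & x2 == "1") x3 > 3,
  if (x1 > 5 & x2 == "1") as.numeric(as.character(x4)) > 4  # x4 is a factor
)
cf <- confront(data_imp, rules)
summary(cf)                       # pass/fail counts per rule
head(violating(data_imp, rules))  # rows that violate a rule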

How can I iteratively do clustering for different cluster (k) values?

I have the following PCA data on which I am doing k-means clustering:
head(pcdffinal)
PC1 PC2 PC3 PC4 PC5 PC6
1 -9.204228 -2.73517110 2.7975063 0.6794614 -0.84627095 0.4455297
2 2.927245 0.05666389 0.5085896 0.1472800 0.18193152 0.1041490
3 -4.667932 -1.98176361 2.2751862 0.5347725 -0.43314927 0.3222719
4 -1.366505 -0.40858595 0.5005192 0.4507366 -0.54996933 0.5533013
5 -4.689454 -2.77185636 2.4323856 0.7387788 0.49237229 -0.4817083
6 -3.477046 -1.84904214 1.5539558 0.5463861 -0.03231143 0.2814843
opt.cluster<-3
set.seed(115)
pccomp.km <- kmeans(pcdffinal,opt.cluster,nstart=25)
head(pccomp.km$cluster)
[1] 2 1 2 2 2 2
barplot(table(pccomp.km$cluster), col="steelblue")
pccomp.km$tot.withinss #For total within cluster sum of squares.
[1] 13172.59
We can also use a plot to illustrate the groups into which the data have been arranged.
par(mfrow=c(1,1))
plot(pcdffinal[,1:2],col=(pccomp.km$cluster+1),main=paste('K-Means Clustering result with k = ', opt.cluster,sep=" "),pch=20,cex=2)
points(pccomp.km$centers, pch=15,cex=2)#plotting the centres of the cluster as black squares
library("factoextra")
fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex")+ theme_minimal()
df.num_kmeans <- df.num
df.num_kmeans$cluster.kmeans <- pccomp.km$cluster  # vector of cluster assignments from kmeans(), added as a column to the original dataset
# save this dataset & kmeans model for further use
saveRDS(pccomp.km, "kmeans_model.RDS")
write.csv(df.num_kmeans,"dfnum_kmeans.cluster.csv")
library(cluster)
clusplot(df.num_kmeans,pccomp.km$cluster,color = TRUE,shade=TRUE,labels = 2,lines = 0)
library(ggfortify)
autoplot(pccomp.km, data=pcdffinal, frame=TRUE,frame.type='norm')
I would like to run k-means iteratively for a range of k values, say k = 2:6, each time making the plots for the respective k and saving the model and the data as a CSV, separately for each k.
I need help converting the above code into a loop with a counter going from 2 to 6.
Original data:
head(df.num_kmeans)
datausage mou revenue calldrop handset2g handset3g smartphone
1 896804.7 2854801 40830.404 27515 7930 19040 20810
2 155932.1 419109 5512.498 5247 2325 2856 3257
3 674983.3 2021183 25252.265 21068 6497 13056 14273
4 522787.2 1303221 14547.380 8865 4693 9439 10746
5 523465.7 1714641 24177.095 25441 8668 12605 14766
6 527062.3 1651303 20153.482 18219 6822 11067 12994
rechargecount rechargesum arpu subscribers
1 4461 235430 197704.10 105822
2 843 39820 34799.21 18210
3 2944 157099 133842.38 71351
4 2278 121697 104681.58 44975
5 2802 144262 133190.55 75860
6 2875 143333 119389.91 63740
Using random forest for accuracy comparison
dfnum.kmeans <- read.csv("dfnum_kmeans.cluster.csv")
table(dfnum.kmeans$cluster.kmeans) # size of each cluster
# convert cluster var into a factor
dfnum.kmeans$cluster.kmeans <- as.factor(dfnum.kmeans$cluster.kmeans)
is.factor(dfnum.kmeans$cluster.kmeans)
# create training and test sets (75:25 split) using the 'caret' package
set.seed(128) # for reproducibility
inTrain_kmeans <- caret::createDataPartition(y = dfnum.kmeans$cluster.kmeans, p = 0.75, list = FALSE)
training_kmeans <- dfnum.kmeans[inTrain_kmeans, ]
testing_kmeans <- dfnum.kmeans[-inTrain_kmeans, ]
set.seed(122)
control <- trainControl(method = "repeatedcv", number = 10,allowParallel = TRUE)
modFit.rfcaret_kmeans <- caret::train(cluster.kmeans~ ., method = "rf",data = training_kmeans, trControl = control, number = 25)
modFit.rfcaret_kmeans$finalModel
pred.test_kmeans = predict(modFit.rfcaret_kmeans, testing_kmeans); confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )
confusionMatrix(pred.test_kmeans, testing_kmeans$cluster.kmeans )$overall[1]
Assuming that your original dataframe is df.num, the following could save all the files (for the different k values) in your working directory:
library(factoextra)  # fviz_cluster()
library(cluster)     # clusplot()
library(ggfortify)   # autoplot() for kmeans objects
for (k in 2:6) {
  set.seed(115)
  pccomp.km <- kmeans(pcdffinal, k, nstart = 25)
  print(paste(k, pccomp.km$tot.withinss))  # total within-cluster sum of squares
  png(paste0('kmeans_proj_', k, '.png'))
  par(mfrow = c(1, 1))
  plot(pcdffinal[, 1:2], col = (pccomp.km$cluster + 1),
       main = paste('K-Means Clustering result with k =', k), pch = 20, cex = 2)
  points(pccomp.km$centers, pch = 15, cex = 2)  # cluster centres as black squares
  dev.off()
  png(paste0('kmeans_fviz_', k, '.png'))
  print(fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex") + theme_minimal())
  dev.off()
  df.num_kmeans <- df.num
  df.num_kmeans$cluster.kmeans <- pccomp.km$cluster  # cluster assignments added as a column
  saveRDS(pccomp.km, paste0("kmeans_model_", k, ".RDS"))
  write.csv(df.num_kmeans, paste0("dfnum_kmeans_", k, ".cluster.csv"))
  png(paste0('clusplot_', k, '.png'))
  clusplot(df.num_kmeans, pccomp.km$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)
  dev.off()
  png(paste0('autoplot_', k, '.png'))
  print(autoplot(pccomp.km, data = pcdffinal, frame = TRUE, frame.type = 'norm'))
  dev.off()
}
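Each saved model can later be restored with readRDS() (e.g. readRDS("kmeans_model_3.RDS")), and the per-k cluster assignments read back with read.csv().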

Using split function in R

I am trying to simulate three small datasets, each containing x1, x2, x3, x4, trt, and IND.
However, when I try to split the simulated data by IND using split() in R, I get warning messages and the outputs are not correct. Could someone please give me a hint about what I did wrong in my R code?
# Step 2: simulate data
Alpha = 0.05
S = 3 # number of replicates
x = 8 # number of covariates
G = 3 # number of treatment groups
N = 50 # number of subjects per dataset
tot = S*N # total subjects for a simulation run
# True parameters
alpha = c(0.5, 0.8) # intercepts
b1 = c(0.1,0.2,0.3,0.4) # for pi_1 of trt A
b2 = c(0.15,0.25,0.35,0.45) # for pi_2 of trt B
b = c(1.1,1.2,1.3,1.4);
##############################################################################
# Scenario 1: all covariates are independent standard normally distributed #
##############################################################################
set.seed(12)
x1 = rnorm(n=tot, mean=0, sd=1);x2 = rnorm(n=tot, mean=0, sd=1);
x3 = rnorm(n=tot, mean=0, sd=1);x4 = rnorm(n=tot, mean=0, sd=1);
###############################################################################
p1 = exp(alpha[1]+b1[1]*x1+b1[2]*x2+b1[3]*x3+b1[4]*x4)/
(1+exp(alpha[1]+b1[1]*x1+b1[2]*x2+b1[3]*x3+b1[4]*x4) +
exp(alpha[2]+b2[1]*x1+b2[2]*x2+b2[3]*x3+b2[4]*x4))
p2 = exp(alpha[2]+b2[1]*x1+b2[2]*x2+b2[3]*x3+b2[4]*x4)/
(1+exp(alpha[1]+b1[1]*x1+b1[2]*x2+b1[3]*x3+b1[4]*x4) +
exp(alpha[2]+b2[1]*x1+b2[2]*x2+b2[3]*x3+b2[4]*x4))
p3 = 1/(1+exp(alpha[1]+b1[1]*x1+b1[2]*x2+b1[3]*x3+b1[4]*x4) +
exp(alpha[2]+b2[1]*x1+b2[2]*x2+b2[3]*x3+b2[4]*x4))
# To assign subjects to one of treatment groups based on response probabilities
tmp = function(x){sample(c("A","B","C"), 1, prob=x, replace=TRUE)}
trt = apply(cbind(p1,p2,p3),1,tmp)
IND = rep(1:S, each = N)  # create an indicator for splitting the simulated data
sim=data.frame(x1,x2,x3,x4,trt, IND)
Aset = subset(sim, trt=="A")
Bset = subset(sim, trt=="B")
Cset = subset(sim, trt=="C")
Anew = split(Aset, f = IND)
Bnew = split(Bset, f = IND)
Cnew = split(Cset, f = IND)
The warning message:
> Anew = split(Aset, f = IND)
Warning message:
In split.default(x = seq_len(nrow(x)), f = f, drop = drop, ...) :
data length is not a multiple of split variable
and the output becomes
$`2`
x1 x2 x3 x4 trt IND
141 1.0894068 0.09765185 -0.46702047 0.4049424 A 3
145 -1.2953113 -1.94291045 0.09926239 -0.5338715 A 3
148 0.0274979 0.72971804 0.47194731 -0.1963896 A 3
$`3`
[1] x1 x2 x3 x4 trt IND
<0 rows> (or 0-length row.names)
I have checked my R code several times; however, I can't figure out what I did wrong. Many thanks in advance!
IND is the global variable for the full data, sim. You want to use the one specific to the subset, e.g.
Anew <- split(Aset, f = Aset$IND)
It's a warning, not an error, which means split executed successfully, but may not have done what you wanted to do.
From the "details" section of the help file:
f is recycled as necessary and if the length of x is not a multiple of
the length of f a warning is printed. Any missing values in f are
dropped together with the corresponding values of x.
Try checking the length of your IND against the size of your dataframe, maybe.
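You can see the recycling behaviour in miniature:
split(1:5, f = 1:2)
# Warning message: data length is not a multiple of split variable
# f is recycled to 1 2 1 2 1, so $`1` holds 1, 3, 5 and $`2` holds 2, 4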
Not sure what your goal is once you have your data split, but this sounds like a good candidate for the plyr package.
> library(plyr)
> ddply(sim, .(trt,IND), summarise, x1mean=mean(x1), x2sum=sum(x2), x3min=min(x3), x4max=max(x4))
trt IND x1mean x2sum x3min x4max
1 A 1 -0.49356448 -1.5650528 -1.016615 2.0027822
2 A 2 0.05908053 5.1680463 -1.514854 0.8184445
3 A 3 0.22898716 1.8584443 -1.934188 1.6326763
4 B 1 0.01531230 1.1005720 -2.002830 2.6674931
5 B 2 0.17875088 0.2526760 -1.546043 1.2021935
6 B 3 0.13398967 -4.8739380 -1.565945 1.7887837
7 C 1 -0.16993037 -0.5445507 -1.954848 0.6222546
8 C 2 -0.04581149 -6.3230167 -1.491114 0.8714535
9 C 3 -0.41610973 0.9085831 -1.797661 2.1174894
You can substitute summarise and its following arguments for any function that returns a data.frame (or something that can be coerced to one). If lists are the target, ldply is your friend.
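For what it's worth, the dplyr equivalent of that ddply() call would be:
library(dplyr)
sim %>%
  group_by(trt, IND) %>%
  summarise(x1mean = mean(x1), x2sum = sum(x2),
            x3min = min(x3), x4max = max(x4), .groups = "drop")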
