Plot a re-leveled pairwise distance matrix in ggplot2 - r

Loading libraries and creating a reproducible example
#Load libraries
set.seed(123)
library(tidyr)
library(ggplot2)
#Creating a fake pairwise matrix
locs <- 5
tmp <- matrix(runif(n = locs*locs),nrow = locs,ncol = locs)
tmp[upper.tri(tmp,diag = T)] <- NA
colnames(tmp) <- LETTERS[1:locs]
rownames(tmp) <- LETTERS[1:locs]
tmp
#Converting into a data frame
tmp1 <- as.data.frame(cbind(rownames(tmp),as.data.frame(tmp)))
names(tmp1)[1] <- "locA"
rownames(tmp1) <- NULL
head(tmp1)
#Changing it to long form and getting rid of NAs
tmp1 <- gather(tmp1, key = "locB",value = "value",-locA)
tmp1 <- tmp1[!is.na(tmp1$value),]
tmp1
#Making a tiled plot based on default levels
ggplot(tmp1, aes(x = locA, y=locB, fill=value, label=round(value,3)))+
geom_tile(color="black")+
geom_text(size=5,color="white")
But for reasons that make more biological sense, I want to change the order in which those comparisons are ordered
#biological order
my.order <- c("A","C","D","B","E")
my.order
#re-leveling
tmp1$locA <- factor(tmp1$locA, levels = my.order,ordered = T)
tmp1$locB <- factor(tmp1$locB, levels = my.order,ordered = T)
tmp1
#the releveled plot
ggplot(tmp1, aes(x = locA, y=locB, fill=value, label=round(value,3)))+
geom_tile(color="black")+
geom_text(size=5,color="white")
I am trying to find a way to get the "B-C" & "B-D" comparisons to be represented in the lower diagonal.
I tried to find a solution with a full matrix and lower.tri(), but have failed so far
#here is the full matrix
x <- tmp
x[is.na(x)] <- 0
y <- t(tmp)
y[is.na(y)] <- 0
full.matrix <- x+y
full.matrix
#the function lower.tri might be useful in this context
lower.tri(full.matrix)

Starting from after tmp and full.matrix are created, if you run:
reordered_mat <- full.matrix[match(my.order, rownames(full.matrix)),
match(my.order, colnames(full.matrix))]
lt_reordered_mat <- replace(reordered_mat, !lower.tri(reordered_mat), NA)
tmp1 <- as.data.frame(cbind(rownames(lt_reordered_mat),as.data.frame(lt_reordered_mat)))
And then rerun all your tmp1 creation code and reordering, then you should get you desired result:
Full reproducible code:
#Load libraries
set.seed(123)
library(tidyr)
library(ggplot2)
#Creating a fake pairwise matrix
locs <- 5
tmp <- matrix(runif(n = locs*locs),nrow = locs,ncol = locs)
tmp[upper.tri(tmp,diag = T)] <- NA
colnames(tmp) <- LETTERS[1:locs]
rownames(tmp) <- LETTERS[1:locs]
x <- tmp
x[is.na(x)] <- 0
y <- t(tmp)
y[is.na(y)] <- 0
full.matrix <- x+y
my.order <- c("A","C","D","B","E")
reordered_mat <- full.matrix[match(my.order, rownames(full.matrix)),
match(my.order, colnames(full.matrix))]
lt_reordered_mat <- replace(reordered_mat, !lower.tri(reordered_mat), NA)
tmp1 <- as.data.frame(cbind(rownames(lt_reordered_mat),as.data.frame(lt_reordered_mat)))
names(tmp1)[1] <- "locA"
rownames(tmp1) <- NULL
#Changing it to long form and getting rid of NAs
tmp1 <- gather(tmp1, key = "locB",value = "value",-locA)
tmp1 <- tmp1[!is.na(tmp1$value),]
#re-leveling
tmp1$locA <- factor(tmp1$locA, levels = my.order,ordered = T)
tmp1$locB <- factor(tmp1$locB, levels = my.order,ordered = T)
#the releveled plot
ggplot(tmp1, aes(x = locA, y=locB, fill=value, label=round(value,3)))+
geom_tile(color="black")+
geom_text(size=5,color="white")

As Mike H. was providing his answer, I created a slightly different solution. I think his answer is better because it's more succinct and doesn't use a for loop.
#Load libraries
set.seed(123)
library(tidyr)
library(ggplot2)
#Creating a fake pairwise matrix
locs <- 5
tmp <- matrix(runif(n = locs*locs),nrow = locs,ncol = locs)
tmp[upper.tri(tmp,diag = T)] <- NA
colnames(tmp) <- LETTERS[1:locs]
rownames(tmp) <- LETTERS[1:locs]
tmp
#Converting into a data frame
tmp1 <- as.data.frame(cbind(rownames(tmp),as.data.frame(tmp)))
names(tmp1)[1] <- "locA"
rownames(tmp1) <- NULL
head(tmp1)
#Changing it to long form and getting rid of NAs
tmp1 <- gather(tmp1, key = "locB",value = "value",-locA)
tmp1 <- tmp1[!is.na(tmp1$value),]
tmp1
#Making a tiled plot based on default levels
ggplot(tmp1, aes(x = locA, y=locB, fill=value, label=round(value,3)))+
geom_tile(color="black")+
geom_text(size=5,color="white")
#biological order
my.order <- c("A","C","D","B","E")
my.order
#re-leveling
tmp1$locA <- factor(tmp1$locA, levels = my.order,ordered = T)
tmp1$locB <- factor(tmp1$locB, levels = my.order,ordered = T)
tmp1
#the releveled plot
ggplot(tmp1, aes(x = locA, y=locB, fill=value, label=round(value,3)))+
geom_tile(color="black")+
geom_text(size=5,color="white")
#reordering tmp by my.order and replacing NAs with zero
x <- tmp
x<- x[my.order,my.order]
x[is.na(x)] <- 0
x
#identifying which values switch from the lower matrix to the upper matrix
y <- x
y[y !=0] <- 1
#figuring out which side of the matrix that needs to be switched to switch locA and locB
if(sum(y[lower.tri(y)]) > sum(y[upper.tri(y)])){ y[lower.tri(y)] <- 0 }
if(sum(y[lower.tri(y)]) == sum(y[upper.tri(y)])){ y[lower.tri(y)] <- 0 }
if(sum(y[lower.tri(y)]) < sum(y[upper.tri(y)])){ y[upper.tri(y)] <- 0 }
#Converting t into a long form data frame
fm <- as.data.frame(cbind(rownames(y),as.data.frame(y)))
names(fm)[1] <- "locA"
rownames(fm) <- NULL
fm <- gather(fm, key = "locB",value = "value",-locA)
#identifying which need to be switched and created an identifer to merge with
fm$action <- ifelse(fm$value == 1,"switch","keep")
fm$both <- paste0(fm$locA,fm$locB)
fm
#creating the same identifer in tmp1
tmp1$both <- paste0(tmp1$locA,tmp1$locB)
head(tmp1)
#merging the fm and tmp1 together
tmp2 <- merge(x = fm[,4:5],y = tmp1,by = "both")
tmp2
#using a for loop to make the necessary switches
i <- NULL
for(i in 1:nrow(tmp2)){
if(tmp2$action[i] == "switch"){
A <- as.character(tmp2$locA[i])
B <- as.character(tmp2$locB[i])
tmp2$locA[i] <- B
tmp2$locB[i] <- A
}
}
tmp2
#re-leveling to my order
tmp2$locA <- factor(tmp2$locA, levels = my.order,ordered = T)
tmp2$locB <- factor(tmp2$locB, levels = my.order,ordered = T)
tmp2
#now the graphic
ggplot(tmp2, aes(x = locA, y=locB, fill=value, label=round(value,3)))+
geom_tile(color="black")+
geom_text(size=5,color="white")

Related

r: Why does the result of appending element to a list manually differs from using for loop?

The plot_ts list, where I added the elements manually, is correct.
Why all the elements of plot_TS are the graph of the len-th column instead?
library(tidyverse)
rowdates <- function(df){
return(as.Date(row.names(df), format='%d/%m/%Y'))
}
plot_i <- function(df, cl){
return (ggplot(data = df) +
geom_point(mapping = aes(x = rowdates(df),
y = as.numeric(df[,cl])),
color = 'blue')
)
}
Graphs_TS <- read.csv2("~/Documents/Graphs_TS.csv", row.names=1)
points_TS <- rownames(Graphs_TS)
# len <- length(points_TS)
len <- 2
plot_TS <- list()
plot_ts <- list()
for (i in 1:len){
plot_TS[[i]] <- plot_i(Graphs_TS, i)
}
plot_ts[[1]] <- plot_i(Graphs_TS, 1)
plot_ts[[2]] <- plot_i(Graphs_TS, 2)

multiple data frames with similar names

I needed to generate array or many data frames from other data frames which only varied in names. This required me to do a lot of copy-paste works. Is it possible that I can make it cleaner but not keep copying and pasting? Follows are two examples from many similar cases of the analysis I am doing now (I will provide codes for reproduction at the end of the question), which I think may be able to make them cleaner with the same approach.
case 1, create an array with data from per_d1,per_d1,per_d3,per_d4,per_d5
perd <- array(dim=c(7,15,5))
perd [,,1] <- as.matrix(per_d$per_d1)
perd [,,2] <- as.matrix(per_d$per_d2)
perd [,,3] <- as.matrix(per_d$per_d3)
perd [,,4] <- as.matrix(per_d$per_d4)
perd [,,5] <- as.matrix(per_d$per_d5)
case 2, create multiple data frames from data with similar names.
dataplot <- dfmak (per_d$per_d1,ge$per_d1$g1,ge$per_d1$g2,ge$per_d1$g3,ge$per_d1$g4,ge$per_d1$g5)
dataplot2 <- dfmak (per_d$per_d2,ge$per_d2$g1,ge$per_d2$g2,ge$per_d2$g3,ge$per_d2$g4,ge$per_d2$g5)
dataplot3 <- dfmak (per_d$per_d3,ge$per_d3$g1,ge$per_d3$g2,ge$per_d3$g3,ge$per_d3$g4,ge$per_d3$g5)
dataplot4 <- dfmak (per_d$per_d4,ge$per_d4$g1,ge$per_d4$g2,ge$per_d4$g3,ge$per_d4$g4,ge$per_d4$g5)
dataplot5 <- dfmak (per_d$per_d5,ge$per_d5$g1,ge$per_d5$g2,ge$per_d5$g3,ge$per_d5$g4,ge$per_d5$g5)
codes for reproduction
N <- 1
CS <- 10.141
S <- seq (7.72,13,0.807)
t <- 15
l <- length (S)
m0 <- 100
exps <- c(0.2, 0.5, 0.9, 1.5, 2)
sd <- c(0.2, 0.5, 0.8, 1.3, 1.8)
names(sd) <- paste("per", seq_along(sd), sep = "")
per <- lapply(sd, function(x){
per <- matrix(nrow = length(S)*N, ncol = t+1)
for (i in 1:dim(per)[1]) {
for (j in 1:t+1){
per [,1] <- replicate (n = N, S)
per [i,j] <- round (abs (rnorm (1, mean = per[i,1], sd =x)),digits=3)
colnames(per) <- c('physical',paste('t', 1:15, sep = ""))
per <- as.data.frame (per)
}
}
per <- per [,-1]
return(per)
}
)
per_d <- lapply(per, function(x){
per_d <- abs (x - 10.141)
}
)
names(per_d) <- paste("per_d", seq_along(sd), sep = "")
gefun <- function (i){
res <- lapply(exps, function(x){
g <- as.matrix (m0 * exp (-x * i))
for (i in 1:l) {
for (j in 1:t){
g [i,j] <- abs((round (rnorm(1,mean = g[i,j],sd=3), digits = 3)))
colnames(g) <- paste('t', 1:ncol(g), sep = "")
g <- as.data.frame(g)
}}
return(g)
}
)
}
ge <- lapply(per_d, gefun)
for (i in 1:length(ge)){
names(ge[[i]]) <- paste("g", seq_along(ge), sep = "")
}
dfmak <- function(df1,df2,df3,df4,df5,df6){
data.frame(stimulus = c (paste0('S',1:3),'CS+',paste0('S',5:7)),
phy_dis = S,
per_dis = c(df1$t1,df1$t2,df1$t3,df1$t4,df1$t5,df1$t6,df1$t7,df1$t8,df1$t9,df1$t10,df1$t11,df1$t12,df1$t13,df1$t14,df1$t15),
trials = rep(1:15, each = 7),
response_0.2 = c (df2$t1,df2$t2,df2$t3,df2$t4,df2$t5,df2$t6,df2$t7,df2$t8,df2$t9,df2$t10,df2$t11,df2$t12,df2$t13,df2$t14,df2$t15),
response_0.5 = c (df3$t1,df3$t2,df3$t3,df3$t4,df3$t5,df3$t6,df3$t7,df3$t8,df3$t9,df3$t10,df3$t11,df3$t12,df3$t13,df3$t14,df3$t15),
response_0.9 = c (df4$t1,df4$t2,df4$t3,df4$t4,df4$t5,df4$t6,df4$t7,df4$t8,df4$t9,df4$t10,df4$t11,df4$t12,df4$t13,df4$t14,df4$t15),
response_1.5 = c (df5$t1,df5$t2,df5$t3,df5$t4,df5$t5,df5$t6,df5$t7,df5$t8,df5$t9,df5$t10,df5$t11,df5$t12,df5$t13,df5$t14,df5$t15),
response_2 = c (df6$t1,df6$t2,df6$t3,df6$t4,df6$t5,df6$t6,df6$t7,df6$t8,df6$t9,df6$t10,df6$t11,df6$t12,df6$t13,df6$t14,df6$t15)
)
}
You can try the followings. But the codes, unfortunately, are not short.
Case 1
a <- lapply(per_d, as.matrix)
b <- c(a, recursive = TRUE)
pred <- array(b, dim = c(7,15,5))
Case 2
The data frames will be stored in a list. You still have to extract them using $ or [[]].
# create empty lists to store the outputs
out <- list()
name <- list()
for(i in 1:5) {
a <- per_d[[i]]
b <- ge[[i]][[1]]
c <- ge[[i]][[2]]
d <- ge[[i]][[3]]
e <- ge[[i]][[4]]
f <- ge[[i]][[5]]
arg <- list(a, b, c, d, e, f)
name[[i]] <- paste0("df_", i)
out[[i]] <- do.call(dfmak, arg)
}
out <- setNames(out, name)

Sum value by Combine all variable using R

Can somebody help me with data manipulation using R? i have data (data.train) like this
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
i need all combination from variable and the final form is like this
I have tried this code:
### Library
library(dplyr)
## datex
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
names.group <- names(data.train)[1:length(data.train)-1]
data.group <- Map(combn, list(names.group), seq_along(names.group), simplify = F) %>% unlist(recursive = F)
find.index <- sapply(data.group, function(x, find.y){
any(find.y %in% x)
}, find.y = c("datex"))
index.group <- NULL
for(i in 2:length(find.index)){
if(find.index[i] == "TRUE"){
index.group[i] <- i
}
}
index.group[is.na(index.group)] <- 0
for(i in 1:length(data.group)){
if(index.group[i] == 0){
data.group[[i]] <- 0
} else {
data.group[[i]] <- data.group[[i]]
}
}
data.group2 <- data.group[sapply(data.group, function(x) any(x != 0))]
combination.result <- lapply(data.group2, FUN = function(x) {
do.call(what = group_by_, args = c(list(data.train), x)) %>% summarise(sumVar = sum(valx))
})
combination.result
but i don't produce what i want. Thanks
You can generate for combinations of length 1 then for combinations of length 2. Use paste to create your Variable column. Then rbindlist all your results to get the final output.
library(data.table)
setDT(data.train)
sumCombi <- function(x, mySep="_") {
data.train[ , sum(Val), by=c("Date", x)][,
list(Date,
Variable=do.call(paste, c(.SD[,x,with=FALSE], list(sep=mySep))),
SumVal=V1)]
}
rbindlist(c(
#combinations with 1 element in each combi
lapply(c("A", "B", "C"), sumCombi)
,
#combinations with 2 elements in each combi
lapply(combn(c("A","B","C"), 2, simplify=FALSE), sumCombi)
), use.names=FALSE)
or more generically/programmatically:
#assuming that your columns are in the middle of the columns while excl. first and last columns
myCols <- names(data.train)[-c(1, ncol(data.train))]
rbindlist(unlist(
lapply(seq_along(myCols), function(n)
combn(myCols, n, sumCombi, simplify=FALSE)
), recursive=FALSE),
use.names=FALSE)

How can I add a data frame name as a title in a corrplot?

I subseted my data set into some data frames and I want to create a pdf file with all correlation plots, but each plot should come with the name of the data frame as a title. Here is my code:
# Function 1 for correlation matrix
cor.mtest <- function(mat, conf.level = 0.95){
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
diag(p.mat) <- 0
diag(lowCI.mat) <- diag(uppCI.mat) <- 1
for(i in 1:(n-1)){
for(j in (i+1):n){
tmp <- cor.test(mat[,i], mat[,j], conf.level = conf.level)
p.mat[i,j] <- p.mat[j,i] <- tmp$p.value
lowCI.mat[i,j] <- lowCI.mat[j,i] <- tmp$conf.int[1]
uppCI.mat[i,j] <- uppCI.mat[j,i] <- tmp$conf.int[2]
}
}
return(list(p.mat, lowCI.mat, uppCI.mat))
}
# Function 2 for correlation plot
correlation <- function(obj){
dat <- select_if(obj, is.numeric)
pval.mat <- cor.mtest(dat,0.95)
title <- #NAME OF THE DATA FRAME#
dat1 <- corrplot(cor(dat,method="pearson"),method="number",type="upper",sig.level=0.05,p.mat = pval.mat[[1]], tl.pos="upper")
dat2 <- corrplot(cor(dat,method="pearson"),add=T,type="lower",method="square", tl.pos="n", title=title, mar=c(0,0,1,0))
return(list(dat1, dat2))
}
# Saving all correlation in a PDF file
par(ask=F)
pdf('correlations.pdf')
corrbv1nona <- correlation(bv1nona)
corrbv2nona <- correlation(bv2nona)
corrbv3nona <- correlation(bv3nona)
corrbv4 <- correlation(bv4)
corrfull1nona <- correlation(full1nona)
corrfull2nona <- correlation(full2nona)
corrfull3nona <- correlation(full3nona)
corrfull4 <- correlation(full4)
corrsl1nona <- correlation(sl1nona)
corrsl2nona <- correlation(sl2nona)
corrsl3nona <- correlation(sl3nona)
corrsl4 <- correlation(sl4)
dev.off()
Or you can extract the name with
title <- deparse(substitute(obj))
You can extract the name with match.call:
title <- as.character(as.list(match.call())$obj)
For your first use of correlation, this should set title to "bv1nona".

IDW parameters in R

I want to perform IDW interpolation using R using the idw command from the gstat package. I have this data:
#settings
library(gstat)
library(dplyr)
library(sp)
library(tidyr)
id_rep <- rep(c(1,2), 20)
f <- rep(c(930,930.2), each=20)
perc <- rep(c(90, 80), each=10)
x <- sample(1:50, 40)
y <- sample(50:100, 40)
E <- runif(40)
df <- data.frame(id_rep, perc, x,y, f, E)
df_split <- split(df, list(df$id_rep, df$perc, df$f), drop = TRUE, sep="_")
#grid
x.range <- range(df$x)
y.range <- range(df$y)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=1),
y = seq(y.range[1], y.range[2], by=1))
coordinates(grid) <- ~x + y
#interpolation
lst_interp_idw <- lapply(df_split, function(X) {
coordinates(X) <- ~x + y
E_idw <- idw(E~ 1, X, grid, idp=1, nmax=3) %>% as.data.frame()
df_interp <- select(E_idw, x,y,E_pred=var1.pred)
df_interp
})
df_interp_idw <- bind_rows(lst_interp_idw, .id = "interact") %>%
separate(interact, c("id_rep", "perc", "f"), sep = "\\_")
Now I want to perform each run with different idp and nmax parameters within certain values​ (idp from 1 to 3 by 0.5, and nmax 3 to 6 by 1) and get out a data frame with columns for each combination of idp and nmax values. I try with two for loops but it doesn't work.
EDIT
the code that doesn't work is:
idp = seq(from = 1, to = 3, by = 0.5)
nmax = seq(from = 3, to = 6, by = 1)
...
for(i in idp) {
for(j in nmax)
{ E_idw= idw(E ~ 1, X, grid, nmax = i, idp = j)
}
}
...
Here is a way how to store the result of every iteration in a list.
#settings
#install.packages("gstat")
library(gstat)
library(dplyr)
library(sp)
library(tidyr)
id_rep <- rep(c(1,2), 20)
f <- rep(c(930,930.2), each=20)
perc <- rep(c(90, 80), each=10)
x <- sample(1:50, 40)
y <- sample(50:100, 40)
E <- runif(40)
df <- data.frame(id_rep, perc, x,y, f, E)
df_split <- split(df, list(df$id_rep, df$perc, df$f), drop = TRUE, sep="_")
#grid
x.range <- range(df$x)
y.range <- range(df$y)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=1),
y = seq(y.range[1], y.range[2], by=1))
coordinates(grid) <- ~x + y
# ==============================================
# NEW function
# ==============================================
idp = seq(from = 1, to = 3, by = 0.5)
nmax = seq(from = 3, to = 6, by = 1)
#interpolation
lst_interp_idw <- lapply(df_split, function(X) {
coordinates(X) <- ~x + y
df_interp <- vector(length(idp)*length(nmax), mode = "list" )
k <- 0
for(i in idp) {
for(j in nmax) {
print(paste(i, j))
# Iterator
k <- k + 1
E_idw= idw(E ~ 1, X, grid, nmax = i, idp = j) %>% as.data.frame()
df_interp[[k]] <- select(E_idw, x,y,E_pred=var1.pred)
}
}
return(df_interp)
})
# ==============================================
Some plausibility checks (lapply is applied to 8 list elements and 20 variations are calculated):
length(lst_interp_idw) # 8
length(lst_interp_idw[[1]]) #20
length(lst_interp_idw[[1]]) #20
It should be easy for you to adapt the last line of your code
df_interp_idw <- bind_rows(lst_interp_idw, .id = "interact") %>%
separate(interact, c("id_rep", "perc", "f"), sep = "\\_")
to format the output in the desired format. This highly depends on how you want to present the different interpolation alternatives.

Resources