Let's assume four data frames, each with 3 vectors, e.g.
setA <- data.frame(
  a1 = c(6,5,2,4,5,3,4,4,5,3),
  a2 = c(4,3,1,4,5,1,1,6,3,2),
  a3 = c(5,4,5,6,4,6,5,5,3,3)
)
setB <- data.frame(
  b1 = c(5,3,4,3,3,6,4,4,3,5),
  b2 = c(4,3,1,3,5,2,5,2,5,6),
  b3 = c(6,5,4,3,2,6,4,3,4,6)
)
setC <- data.frame(
  c1 = c(4,4,5,5,6,4,2,2,4,6),
  c2 = c(3,3,4,4,2,1,2,3,5,4),
  c3 = c(4,5,4,3,5,5,3,5,5,6)
)
setD <- data.frame(
  d1 = c(5,5,4,4,3,5,3,5,5,4),
  d2 = c(4,4,3,3,4,3,4,3,4,5),
  d3 = c(6,5,5,3,3,4,2,5,5,4)
)
I'm trying to find the n vectors in each data frame that have the highest correlation with each other. For this simple example, let's say we want to find the n = 1 vector in each of the k = 4 data frames that shows the overall strongest, positive correlation cor().
I'm not interested in the correlation of vectors within a data frame, but in the correlation between data frames, since I wish to pick 1 variable from each set.
Intuitively, I would sum all the correlation coefficients for each combination, i.e.:
sum(cor(cbind(setA$a1, setB$b1, setC$c1, setD$d1)))
sum(cor(cbind(setA$a1, setB$b2, setC$c1, setD$d1)))
sum(cor(cbind(setA$a1, setB$b1, setC$c2, setD$d1)))
... # and so on...
...but this seems like brute-forcing a solution that might be solvable more elegantly, with some kind of clustering technique?
Anyhow, I was hoping to find a dynamic solution like function(n = 1, ...) (where ... stands for the data frames) which would return a list of the names of the highest-correlating vectors.
Based on your example, I would not go with a really complicated algorithm unless your actual data is huge. This is a simple approach that I think gets what you want.
Based on your 4 data frames, I create list_df; then, inside the function, I generate all the possible combinations of variables and calculate their correlation. At the end I select the n combinations with the highest correlation.
list_df = list(setA, setB, setC, setD)
CombMaxCor = function(n = 1, list_df){
  column_names = lapply(list_df, colnames)
  mat_comb = expand.grid(column_names)
  mat_total = do.call(cbind, list_df)
  vec_cor = rep(NA, nrow(mat_comb))
  for(i in 1:nrow(mat_comb)){
    vec_cor[i] = sum(cor(mat_total[, as.character(unlist(mat_comb[i, ]))]))
  }
  pos_max_temp = rev(sort(vec_cor))[1:n]
  pos_max = vec_cor %in% pos_max_temp
  comb_max_cor = mat_comb[pos_max, ]
  return(comb_max_cor)
}
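For the example sets this can then be called directly; note that n here is the number of top-scoring combinations to return, so n = 1 gives the single best combination of variable names:
CombMaxCor(n = 1, list_df)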
You could use the combn function. The combined data frame a is not defined in the original answer; judging by the output names below, the assumption is:
a <- cbind(setA, setB, setC, setD)
fun = function(x){
  nm = paste0(names(x), collapse = "")
  # skip combinations that pick two columns from the same set
  # (a repeated letter in the concatenated names)
  if(!grepl("(.)\\d.*\\1", nm, perl = TRUE))
    setNames(sum(cor(x)), nm)
}
unlist(combn(a, 4, fun, simplify = FALSE))[1:3] # only printing the first 3
a1b1c1d1 a1b1c1d2 a1b1c1d3
3.246442 4.097532 3.566949
sum(cor(cbind(setA$a1, setB$b1, setC$c1, setD$d1)))
[1] 3.246442
sum(cor(cbind(setA$a1, setB$b1, setC$c1, setD$d2)))
[1] 4.097532
sum(cor(cbind(setA$a1, setB$b1, setC$c1, setD$d3)))
[1] 3.566949
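To pick the overall winner rather than eyeballing the printout, you can sort the full named vector (a small follow-up sketch, not part of the original answer):
all_sums <- unlist(combn(a, 4, fun, simplify = FALSE))
head(sort(all_sums, decreasing = TRUE), 1) # name and correlation sum of the best combination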
Here is a function we can use to get n non-repeating columns from each data frame to get the max total correlation:
func <- function(n, ...){
  list.df <- list(...)
  n.df <- length(list.df)
  # 1) First get the correlations between the columns of every pair of data frames
  get.two.df.cors <- function(df1, df2) apply(df1, 2,
    function(x) apply(df2, 2, function(y) cor(x, y))
  )
  cor.combns <- lapply(list.df, function(x)
    lapply(list.df, function(y) get.two.df.cors(x, y))
  )
  # 2) Define a function to help with aggregating the correlations.
  # We will call it for different combinations of selected columns from each df later.
  # cmbns: a matrix of selected columns, where the i-th row holds the columns
  # selected from the i-th df; returns the "total correlation" of that selection
  get.cmbn.sum <- function(cmbns, cor.combns){
    # a helper matrix to aid aggregation:
    # each row gives the two data frames whose correlation sums we want
    df.df <- t(combn(seq(n.df), 2, c))
    # convert to a list of selections, one per df
    cmbns <- split(cmbns, seq(nrow(cmbns)))
    sums <- apply(df.df, 1,
      function(dfs) sum(
        cor.combns[[dfs[1]]][[dfs[2]]][cmbns[[dfs[2]]], cmbns[[dfs[1]]]]
      )
    )
    # the sum of the sums gives the "total correlation"
    sum(sums)
  }
  # 3) Now perform the aggregation.
  # Get the ways of choosing n columns from each of the k data frames
  if (n==1) {
    cmbns.each.df <- lapply(list.df, function(df) matrix(seq(ncol(df)), ncol=1))
  } else {
    cmbns.each.df <- lapply(list.df, function(df) t(combn(seq(ncol(df)), n, c)))
  }
  # get all unique selection methods
  unique.selections <- Reduce(function(all.dfs, new.df){
    all.dfs.lst <- rep(list(all.dfs), nrow(new.df))
    all.new.rows <- lapply(seq(nrow(new.df)), function(x) new.df[x, , drop=F])
    for(i in seq(nrow(new.df))){
      for(j in seq(length(all.dfs.lst[[i]]))){
        all.dfs.lst[[i]][[j]] <- rbind(all.dfs.lst[[i]][[j]], all.new.rows[[i]])
      }
    }
    do.call(c, all.dfs.lst)
  }, c(list(list(matrix(numeric(0), nrow=0, ncol=n))), cmbns.each.df))
  # for each unique selection method, calculate the total correlation
  result <- sapply(unique.selections, get.cmbn.sum, cor.combns=cor.combns)
  return( unique.selections[[which.max(result)]] )
}
And now we have:
# n = 1
func(1, setA, setB, setC, setD)
# [,1]
# [1,] 1
# [2,] 2
# [3,] 3
# [4,] 2
# n = 2
func(2, setA, setB, setC, setD)
# [,1] [,2]
# [1,] 1 2
# [2,] 2 3
# [3,] 2 3
# [4,] 2 3
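If you prefer variable names over column indices, the returned matrix maps straight back (a small follow-up sketch, not part of the original answer):
sel <- func(1, setA, setB, setC, setD)
mapply(function(df, i) colnames(df)[i], list(setA, setB, setC, setD), sel)
# "a1" "b2" "c3" "d2"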
I have 4 data frames of genes; each data frame has gene names as rows and about 20 columns of sample data. Thus each matrix has this many rows (genes):
A: 10,000 genes
B: 15,000 genes
C: 35,000 genes
D: 12,000 genes
Here is what I tried; it didn't select the complete list of 9,000 common rows (genes):
Data_A = read.csv("matrix_A.csv");
Data_B = read.csv("matrix_B.csv");
Data_C = read.csv("matrix_C.csv");
Data_D = read.csv("matrix_D.csv");
Expr_A = as.data.frame(t(Data_A[, -c(1:8)]))
Expr_B = as.data.frame(t(Data_B[, -c(1:8)]))
Expr_C = as.data.frame(t(Data_C[, -c(1:8)]))
Expr_D = as.data.frame(t(Data_D[, -c(1:8)]))
commonGenes1 = intersect (rownames(Data_A),rownames(Data_D))
commonGenes2 = intersect (rownames(Data_B),rownames(Data_D))
commonGenes3 = intersect (rownames(Data_C),rownames(Data_D))
Data_A = Data_A[commonGenes1,]
Data_B = Data_B[commonGenes2,]
Data_C = Data_C[commonGenes3,]
They all have 9,000 genes in common, but the data are too big to do this in Excel. I'm using R to process the data; is there a way to select the common genes between the 4 data frames in R?
An example of the 4 matrices is here:
http://www.filedropper.com/matrixexample
Let's actually put things in a list (as your title suggests); it's good practice.
list_of_data = list(Data_A, Data_B, Data_C, Data_D)
## for demo purposes, you can use
# list_of_data = list(mtcars[1:6, ], mtcars[4:9, ])
# this will get the intersection of the row.names for everything in the list
common_names = Reduce(intersect, lapply(list_of_data, row.names))
list_of_data = lapply(list_of_data, function(x) { x[row.names(x) %in% common_names,] })
Thanks to @eipi10 for a better way to filter rows for each data frame in a list. Check out the revision history for a lame for loop.
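One note on a design choice here: %in% keeps each data frame's own row order. If you also need the rows aligned identically across all data frames, index by row name instead (a small addition, not part of the original answer):
list_of_data = lapply(list_of_data, function(x) x[common_names, ])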
What about this?
# Create some fake data:
set.seed(123)
m1 <- cbind(sample(1:5), round(rnorm(5),2))
m2 <- cbind(sample(1:5), round(rnorm(5),2))
m3 <- cbind(sample(1:5), round(rnorm(5),2))
m4 <- cbind(sample(1:5), round(rnorm(5),2))
rownames(m1) <- LETTERS[sample(1:10, 5)]
rownames(m2) <- LETTERS[sample(1:10, 5)]
rownames(m3) <- LETTERS[sample(1:10, 5)]
rownames(m4) <- LETTERS[sample(1:10, 5)]
ind <- sapply(list(m1,m2,m3), function(x) intersect(rownames(x), rownames(m4)))
mapply(function(x, y) x[rownames(x) %in% y,], x = list(m1,m2,m3), y = ind)
[[1]]
[,1] [,2]
A 4 1.24
D 5 -0.11
E 1 0.18
[[2]]
[,1] [,2]
E 5 1.22
C 2 -0.56
[[3]]
[,1] [,2]
A 2 -0.22
C 1 -0.33
I have a data frame in which name columns and value columns alternate.
I am trying to perform a lillie.test on each value column.
What I have done so far is as follows:
library("nor test")
# create a data
d <- data.frame(v1 = letters[1:10], v2 = runif(10), v3 = LETTERS[1:10], v4 = rnorm(10),
                v5 = letters[1:10], v6 = runif(10), v7 = LETTERS[1:10], v8 = rnorm(10),
                stringsAsFactors = TRUE)
# make the sequence
bm = seq(4, length(d), by=2)
# make an empty matrix
results <- matrix(0,length(d),2)
# run the loop
for (i in 2:bm) {
results[i] <- lillie.test(d[,i])
}
The errors I get:
Error in if (pvalue > 0.1) { : missing value where TRUE/FALSE needed
In addition: Warning messages:
1: In 2:bm : numerical expression has 3 elements: only the first used
2: In mean.default(x) : argument is not numeric or logical: returning NA
3: In Ops.factor(x, mean(x)) : ‘-’ not meaningful for factors
I get a lot of errors when I run my for loop, but when I apply the lillie test to each column individually, I get outputs without error.
for example
results <- lillie.test(d[,2])
results
# Lilliefors (Kolmogorov-Smirnov) normality test
#data: d[, 2]
#D = 0.2996, p-value = 0.01133
Now I have two questions:
1. How can I solve this problem?
2. Is there any other way to perform this test, e.g. using apply or other functions?
If I do the following, the results are stored in a list; however, I want to have them in a matrix like
v2 v4 v6 v8
statistic 0.2995636 0.1783198 0.1659068 0.145466
p.value 0.01133228 0.4898968 0.6069414 0.793838
bm = seq(2, length(d), by=2)
results <- sapply(d[bm], lillie.test)
results
v2 v4
statistic 0.2995636 0.1783198
p.value 0.01133228 0.4898968
method "Lilliefors (Kolmogorov-Smirnov) normality test" "Lilliefors (Kolmogorov-Smirnov) normality test"
data.name "X[[1L]]" "X[[2L]]"
v6 v8
statistic 0.1659068 0.145466
p.value 0.6069414 0.793838
method "Lilliefors (Kolmogorov-Smirnov) normality test" "Lilliefors (Kolmogorov-Smirnov) normality test"
data.name "X[[3L]]" "X[[4L]]"
Let's check this line:
for (i in 2:bm)
It seems OK, but it isn't. Let's print bm:
bm
[1] 4 6 8
You see, bm is not a single integer but a vector of three values. I suspect you'd like to concatenate 2 with bm (judging from your data set), so you can just do this:
bm <- c(2, bm)
bm
[1] 2 4 6 8
Or, do this (and this is better):
bm <- seq(2, 8, 2)
Now you have the values you want to test. I know nothing about the libraries you are using, but let's check your 2:bm
2:bm
[1] 2 3 4
You are running the test on columns 2, 3 and 4, but the third column contains letters (a factor), not numbers!
A quick solution to this would be:
bm <- seq(4, length(d), by=2) ## as you have typed
bm <- c(2, bm)
results <- sapply(d[bm], lillie.test)
d[bm] gives us a subset of our current data.frame, containing only the columns specified by bm, so sapply will do the test for each and every one of those columns and store its results in results.
EDIT: This is probably what you'd like
df <- data.frame(matrix(unlist(results), nrow=4, byrow=F))[c(1,2), ]
You can do this:
bm = seq(2, length(d), by=2)
res<- lapply(d[bm], lillie.test)
m<- do.call(cbind, res)
mresults <- m[1:2,]
print(mresults)
# v2 v4 v6 v8
#statistic 0.1724419 0.1113228 0.2189569 0.1650759
#p.value 0.5447331 0.9777699 0.1902882 0.6148843
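If you want a plain numeric matrix rather than a matrix of list elements, you can also pull out just the two components you care about (a small sketch along the same lines, assuming nortest is loaded):
bm <- seq(2, length(d), by = 2)
results <- sapply(d[bm], function(col) {
  lt <- lillie.test(col)
  c(statistic = unname(lt$statistic), p.value = lt$p.value)
})
results # numeric matrix with rows "statistic" and "p.value" and columns v2, v4, v6, v8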
I have a data frame on which I would like to perform pairwise comparisons between the values of each column. Ultimately I aim to get a cross tabulation of the comparisons, where each value represents the percentage of similarity between the samples in the compared columns. Here is a reproducible example and what I have tried thus far:
a <- c(1:30)
b <- c(30:1)
c <- c(1:10,30:11)
data <- as.data.frame(matrix(c(a,b,c), ncol = 3, nrow = 30))
fr<-apply(combn(1:length(data), 2), 2, function(x) {
result <- as.data.frame(table(
factor(sign(data[,x[1]] - data[,x[2]]), levels=c(0), labels=c("Fr"))
))
colnames(result)[1] <- paste(x, collapse="|")
return(result)
})
fr # returns a list of each comparison, with its respective similarity count
a<-sapply(fr, unlist) # My attempt to get a dataframe/matrix of the results
t(a)
t(a); sapply(fr, unlist); do.call(cbind, fr) # I get different arrangements, but none in the form:
1|2 0
1|3 10
2|3 0
Once I get a data frame in that format, it would become more straightforward to get a crosstab table,
V.1 V.2 V.3
V.1 -
V.2 0 -
V.3 10 0 -
This is what I am ultimately looking for, although the values in the cross-tab table would be #/nrow to obtain the respective percentage values. I am not sure if I am going about this the wrong way, but any input would be appreciated.
You can try:
Cmbn <- combn(seq_along(data),2)
nm1 <- apply(Cmbn, 2, paste, collapse="|")
f1 <- setNames(
apply(Cmbn, 2, function(x) {
x1 <- sign(data[,x[1]]- data[,x[2]])
table(factor(x1, levels=0, labels="Fr")) #not sure why you wanted a label "Fr" as it didn't appear in the results
}),
nm1)
f1
#1|2 1|3 2|3
#0 10 0
names1 <- paste("V", 1:3, sep=".")
m1 <- matrix(0, 3,3, dimnames=list(names1, names1))
m1[paste(col(m1), row(m1), sep="|") %in% names(f1)] <- f1
d1 <- as.data.frame(m1)
d1[upper.tri(d1, diag=TRUE)] <- "-"
d1
# V.1 V.2 V.3
#V.1 - - -
#V.2 0 - -
#V.3 10 0 -
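Since you ultimately want #/nrow as percentages, you can divide the count matrix before masking the upper triangle (a small follow-up sketch, not part of the original answer):
pct <- m1 / nrow(data) * 100 # percent of rows in which the two columns are equal
d2 <- as.data.frame(pct)
d2[upper.tri(d2, diag = TRUE)] <- "-"
d2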
I've got a set of objects, let's say with the IDs 'A' to 'E'. And I've got two data frames which look the following way (as you can see, the second data frame is symmetric):
df1 <- data.frame(ID = LETTERS[1:5], Var = c(9,13,15,11,28))
df2 <- as.data.frame(matrix(data = c(NA,42,83,74,84,42,NA,26,69,9,83,26,NA,67,95,
                                     74,69,67,NA,6,84,9,95,6,NA),
                            ncol = 5, nrow = 5, dimnames = list(df1$ID, df1$ID)))
For example, take the objects 'B' and 'E'. I want to know: Is 13+28 (from df1) less than 9 (from df2)? I'd like to know this for all pairs of objects. The output should be
(a) a logical data frame structured like df2 and
(b) the number of "TRUE" values.
Most of the time I will only need result (b), but sometimes I would also need (a). So if (b) can be calculated without (a) and if this would be significantly faster, then I'd like to have both algorithms in order to select the suitable one dependent on which output I need to answer a particular question.
I'm comparing around 2000 objects, so the algorithm should be reasonably fast. So far I've been only able to implement this with two nested for-loops which is awfully slow. I bet there is a much nicer way to do this, maybe exploiting vectorisation.
This is what it currently looks like:
df3 <- as.data.frame(matrix(data = NA, ncol = nrow(df1), nrow = nrow(df1),
dimnames = list(df1$ID, df1$ID)))
for (i in 2:nrow(df3)){
for (j in 1:(i-1)){
sum.val <- df1[df1$ID == rownames(df3)[i], "Var"] + df1[df1$ID == names(df3)[j], "Var"]
df3[i,j] <- sum.val <= df2[i,j]
}
}
Is this what you want?
# (a) sum every pair of Var values in one step
df3 <- outer(df1$Var, df1$Var, "+")
df3
# element-wise comparison with df2: TRUE where the pair's sum is less than the df2 value
df4 <- df3 < df2
df4
# (b) count the TRUEs; note df4 is symmetric, so each pair is counted twice
sum(df4, na.rm = TRUE)
Here's one way to do it...
# Get row and column indices
ind <- t( combn( df1$ID , 2 ) )
# Get totals
tot <- with( df1 , Var[ match( ind[,1] , ID ) ] + Var[ match( ind[,2] , ID ) ] )
# Make df2 a matrix
m <- as.matrix( df2 )
# Total number of values is simply
sum( m[ ind ] > tot )
#[1] 7
# Mark which values exceed the pair totals (1 = TRUE); indexing with the
# two-column matrix 'ind' puts each result in the right upper-triangle cell
# (assigning to m[upper.tri(m)] would scramble the order, since combn and
# upper.tri enumerate the pairs differently)
m[ ind ] <- m[ ind ] > tot
m
# A B C D E
#A NA 1 1 1 1
#B 42 NA 0 1 0
#C 83 26 NA 1 1
#D 74 69 67 NA 0
#E 84 9 95 6 NA
This will do what you want.
# Generate the data
df1 <- data.frame(ID = LETTERS[1:5], Var = c(9,13,15,11,28))
df2 <- as.data.frame(matrix(data = c(NA,42,83,74,84,42,NA,26,
69,9,83,26,NA,67,95,74,69,
67,NA,6,84,9,95,6,NA),
ncol = 5, nrow = 5,
dimnames = list(df1$ID, df1$ID)))
# Define a pairwise comparison index matrix using 'combn'
idx <- combn(nrow(df1), 2)
# Create a results matrix
res <- matrix(NA, ncol = ncol(df2), nrow = nrow(df2))
# Loop through 'idx' for each possible comparison (without repeats)
for(i in 1:ncol(idx)){
logiTest <- (df1$Var[idx[1,i]] + df1$Var[idx[2,i]]) < df2[idx[1,i], idx[2,i]]
res[idx[1,i], idx[2, i]] <- logiTest
res[idx[2,i], idx[1, i]] <- logiTest
}
# Count the number of 'true' comparisons
nTrues <- sum(res, na.rm = TRUE)/2
The code simply uses a pairwise comparison index (idx) to define which elements in both df1 and df2 are to be used in each iteration of the 'for loop'. It then uses this same index to define where in the 'res' matrix the answer to the logical test is to be written.
N.B. This code will break down if the order of elements in df1 and df2 is not the same. In such cases, it would be appropriate to use the actual letters to define which values to compare, as sketched below.
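A hedged sketch of that name-based variant (assuming df2 keeps the IDs as dimnames, as in the example data):
for(i in 1:ncol(idx)){
  # look up df2 by ID rather than by position
  id1 <- as.character(df1$ID[idx[1, i]])
  id2 <- as.character(df1$ID[idx[2, i]])
  logiTest <- (df1$Var[idx[1, i]] + df1$Var[idx[2, i]]) < df2[id1, id2]
  res[idx[1, i], idx[2, i]] <- logiTest
  res[idx[2, i], idx[1, i]] <- logiTest
}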