dataframe to correlation matrix - r

I have a data frame in R (df) which looks like this:
colA colB
A,B 0.5
A,C 8
B,A 0.5
B,C 9
C,A 8
C,B 9
It represents correlation values obtained by running a certain software.
Now, I would like to convert this data frame to a correlation matrix to be plotted with the Corr() function:
DESIRED OUTPUT:
A B C
A 1 0.5 8
B 0.5 1 9
C 8 9 1
Please, any suggestion about the code I can utilise?

Data:
input <- structure(list(colA = c("A,B", "A,C", "B,A", "B,C", "C,A", "C,B"
), colB = c(0.5, 8, 0.5, 9, 8, 9)), class = "data.frame", row.names = c(NA, -6L))
Solution:
## separate that column "colA" into 2
rc <- read.csv(text = input$colA, header = FALSE)
# V1 V2
#1 A B
#2 A C
#3 B A
#4 B C
#5 C A
#6 C B
tapply(input$colB, unname(rc), FUN = identity, default = 1)
# A B C
#A 1.0 0.5 8
#B 0.5 1.0 9
#C 8.0 9.0 1
Note 1: OP has carelessly made-up data. Correlation is never bigger than 1.
Note 2: Thanks thelatemail for suggesting simply using read.csv instead of scan + matrix + asplit, as was in my initial answer.
Remark 1: If using xtabs, we have to modify diagonal elements to 1 later.
Remark 2: Matrix indexing is also a good approach, but takes more lines of code.
Remark 3: "reshaping" solution is also a good idea.
rc$value <- input$colB
reshape2::acast(rc, V1 ~ V2, fill = 1)
# A B C
#A 1.0 0.5 8
#B 0.5 1.0 9
#C 8.0 9.0 1

Something like that?
# create your input df:
df<-data.frame(colA=c("A,B","A,C","B,A","B,C","C,A","C,B"),value=c(0.5,8,0.5,9,8,9))
# split ID column
df[,c("col.A","col.B")]<- matrix(ncol=2,unlist(strsplit(df$colA,",")),byrow = T)
# reshape
library(reshape2)
dcast( df , col.A~col.B ,fill=1)

Related

extract and format data from dataset into matrix in R

I want to make this dataframe
into this matrix
I have tried:
x <- read.csv("sample1.csv")
ax <- matrix(c(x[1,1],x[2,1],x[1,3],x[1,1],x[3,1],x[1,4],x[1,1],x[4,1],x[1,5],x[1,1],x[5,1],x[1,6],x[1,1],x[6,1],x[1,7],x[2,1],x[1,1],x[2,2],x[2,1],x[3,1],x[2,4],x[2,1],x[4,1],x[2,5],x[2,1],x[5,1],x[2,6],x[3,1],x[6,1],x[2,7],x[3,1],x[1,1],x[3,2],x[3,1],x[2,1],x[3,3],x[3,1],x[4,1],x[3,5],x[3,1],x[5,1],x[3,6],x[3,1],x[6,1],x[3,7],x[4,1],x[1,1],x[4,2],x[4,1],x[2,1],x[4,3],x[4,1],x[3,1],x[4,4],x[4,1],x[5,1],x[4,6],x[4,1],x[6,1],x[4,7],x[5,1],x[1,1],x[2,2],x[5,1],x[2,1],x[2,4],x[5,1],x[3,1],x[2,5],x[5,1],x[4,1],x[2,6],x[5,1],x[6,1],x[2,7],x[6,1],x[1,1],x[2,2],x[6,1],x[2,1],x[2,4],x[6,1],x[3,1],x[2,5],x[6,1],x[4,1],x[2,6],x[6,1],x[5,1],x[2,7]),10,3, byrow=TRUE)
bx <- ax[order(ax[,3], decreasing = TRUE),]
But it's not beautiful at all, and also it's gonna be lots of work if I got different sample data.
So I wish to simplified it if possible, any suggestion?
This can be achieved by using melt() function from reshape2 package:
> a = matrix(c(1:9), nrow = 3, ncol = 3, dimnames = list(LETTERS[1:3], letters[1:3]))
> a
a b c
A 1 4 7
B 2 5 8
C 3 6 9
> library(reshape2)
> melt(a, na.rm = TRUE)
Var1 Var2 value
1 A a 1
2 B a 2
3 C a 3
4 A b 4
5 B b 5
6 C b 6
7 A c 7
8 B c 8
9 C c 9

Aggregated rolling average with a conditional statement in R

I have a data frame that follows the following format.
match team1 team2 winningTeam
1 A D A
2 B E E
3 C F C
4 D C C
5 E B B
6 F A A
7 A D D
8 D A A
What I want to do is to crate variables that calculates the form of both team 1 and 2 over the last x matches. For example, I would want to create a variable called team1_form_last3_matches which for match 8 would be 0.33 (as they won 1 of their last 3 matches) and there would also be a variable called team2_form_last3_matches which would be 0.66 in match 8 (as they won 2 of their last 3 matches). Ideally I would like to be able to specify the number of previous matches to be considered when calculating the teamx_form_lasty variable and those variables to be automatically created. I have tried a bunch of approaches using dplyr, zoo rolling mean functions and a load of nested for / if statements. However, I have not quite cracked it and certainly not in an elegant way. I feel like I am missing a simple solution to this generic problem. Any help would be much appreciated!
Cheers,
Jack
This works for t1l3, you will need to replicate it for t2.
dat <- data.frame(match = c(1:8), team1 = c("A","B","C","D","E","F","A","D"), team2 = c("D","E","F","C","B","A","D","A"), winningTeam = c("A","E","C","C","B","A","D","A"),stringsAsFactors = FALSE)
dat$t1l3 <- c(NA,sapply(2:nrow(dat),function(i) {
df <- dat[1:(i-1),] #just previous games, i.e. excludes current game
df <- df[df$team1==dat$team1[i] | df$team2==dat$team1[i],] #just those containing T1
df <- tail(df,3) #just the last three (or fewer if there aren't three previous games)
return(sum(df$winningTeam==dat$team1[i])/nrow(df)) #total wins/total games (up to three)
}))
How about something like:
dat <- data.frame(match = c(1:8), team1 = c("A","B","C","D","E","F","A","D"), team2 = c("D","E","F","C","B","A","D","A"), winningTeam = c("A","E","C","C","B","A","D","A"))
match team1 team2 winningTeam
1 1 A D A
2 2 B E E
3 3 C F C
4 4 D C C
5 5 E B B
6 6 F A A
7 7 A D D
8 8 D A A
Allteams <- c("A","B","C","D","E","F")
# A vectorized function for you to use to do as you ask:
teamX_form_lastY <- function(teams, games, dat){
sapply(teams, function(x) {
games_info <- rowSums(dat[,c("team1","team2")] == x) + (dat[,"winningTeam"] == x)
lookup <- ifelse(rev(games_info[games_info != 0])==2,1,0)
games_won <- sum(lookup[1:games])
if(length(lookup) < games) warning(paste("maximum games for team",x,"should be",length(lookup)))
games_won/games
})
}
teamX_form_lastY("A", 4, dat)
A
0.75
# Has a warning for the number of games you should be using
teamX_form_lastY("A", 5, dat)
A
NA
Warning message:
In FUN(X[[i]], ...) : maximum games for team A should be 4
# vectorized input
teamX_form_lastY(teams = c("A","B"), games = 2, dat = dat)
A B
0.5 0.5
# so you ca do all teams
teamX_form_lastY(teams = Allteams, 2, dat)
A B C D E F
0.5 0.5 1.0 0.5 0.5 0.0

how to generate grouping variable based on correlation?

library(magrittr)
library(dplyr)
V1 <- c("A","A","A","A","A","A","B","B","B","B", "B","B","C","C","C","C","C","C","D","D","D","D","D","D","E","E","E","E","E","E")
V2 <- c("A","B","C","D","E","F","A","B","C","D","E","F","A","B","C","D","E","F","A","B","C","D","E","F","A","B","C","D","E","F")
cor <- c(1,0.8,NA,NA,NA,NA,0.8,1,NA,NA,NA,NA,NA,NA,1,0.8,NA,NA,NA,NA,0.8,1,NA,NA,NA,NA,NA,NA,1,0.9)
df <- data.frame(V1,V2,cor)
# exclude rows where cor=NA
df <- df[complete.cases(df)==TRUE,]
This is the full data frame, cor=NA represents a correlation smaller than 0.8
df
V1 V2 cor
1 A A 1.0
2 A B 0.8
7 B A 0.8
8 B B 1.0
15 C C 1.0
16 C D 0.8
21 D C 0.8
22 D D 1.0
29 E E 1.0
30 E F 0.9
In the above df, F is not in V1, meaning that F is not of interest
so here I remove rows where V2=F (more generally, V2 equals to value that is not in V1)
V1.LIST <- unique(df$V1)
df.gp <- df[which(df$V2 %in% V1.LIST),]
df.gp
V1 V2 cor
1 A A 1.0
2 A B 0.8
7 B A 0.8
8 B B 1.0
15 C C 1.0
16 C D 0.8
21 D C 0.8
22 D D 1.0
29 E E 1.0
So now, df.gp is the dataset I need to work on
I drop the unused level in V2 (which is F in the example)
df.gp$V2 <- droplevels(df.gp$V2)
I do not want to exclude the autocorrelated variables, in case some of the V1 are not correlated with others, and I would like to put each of them in a separated group
By looking at the cor, A and B are correlated, C and D are correalted, and E belongs to a group by itself.
Therefore, the example here should have three groups.
The way I see this, you may have complicated things by working your data straight into a data.frame. I took the liberty of transforming it back to a matrix.
library(reshape2)
cormat <- as.matrix(dcast(data = df,formula = V1~V2))[,-1]
row.names(cormat) <- colnames(cormat)[-length(colnames(cormat))]
cormat
After I had your correlation matrix, it is easy to see which indices or non NA values are shared with other variables.
a <- apply(cormat, 1, function(x) which(!is.na(x)))
a <- data.frame(t(a))
a$var <- row.names(a)
row.names(a) <- NULL
a
X1 X2 var
1 1 2 A
2 1 2 B
3 3 4 C
4 3 4 D
5 5 6 E
Now either X1 or X2 determines your unique groupings.
Edited by cyrusjan:
The above script is a possible solution when assuming we already select the rows in with cor >= a, where a is a threshold taken as 0.8 in the above question.
Contributed by alexis_laz:
By using cutree and hclust, we can set the threshold in the script (i.e. h=0.8) as blow.
cor.gp <- data.frame(cor.gp =
cutree(hclust(1 - as.dist(xtabs(cor ~ V1 + V2, df.gp))), h = 0.8))

How to keep and remove columns with certain condition simultaneously

I have 8 columns of variables which I must keep column 1 to 3. For column 4 to 8 I need to keep those with only 3 levels and drop which does not qualify that condition.
I tried the following command
data3 <- data2[,sapply(data2,function(col)length(unique(col)))==3]
It managed to retain the variables with 3 levels, but deleted my first 3 columns.
You could do a two step process:
data4 <- data2[1:3]
#Your answer for the second part here:
data3 <- data2[,sapply(data2,function(col)length(unique(col)))==3]
merge(data3,data4)
Depending on what you would like your expected output to be, could try with the option all =TRUE inside the merge().
I would suggest another approach:
x = 1:3
cbind(data2[x], Filter(function(i) length(unique(i))==3, data2[-x]))
# 1 2 3 5
#1 a 1 3 b
#2 b 2 4 b
#3 c 3 5 b
#4 d 4 6 a
#5 e 5 7 c
#6 f 6 8 c
#7 g 7 9 c
#8 h 8 10 a
#9 i 9 11 c
#10 j 10 12 b
Data:
data2 = setNames(
data.frame(letters[1:10],
1:10,
3:12,
sample(letters[1:10],10, replace=T),
sample(letters[1:3],10, replace=T)),
1:5)
Assuming that the columns 4:8 are factor class, we can also use nlevels to filter the columns. We create 'toKeep' as the numeric index of columns to keep, and 'toFilter' as numeric index of columns to filter. We subset the dataset into two: 1) using the 'toKeep' as the index (data2[toKeep]), 2) using the 'toFilter', we further subset the dataset by looping with sapply to find the number of levels (nlevels), create logical index (==3) to filter the columns and cbind with the first subset.
toKeep <- 1:3
toFilter <- setdiff(seq_len(ncol(data2)), n)
cbind(data2[toKeep], data2[toFilter][sapply(data2[toFilter], nlevels)==3])
# V1 V2 V3 V4 V6
#1 B B D C B
#2 B D D A B
#3 D E B A B
#4 C B E C A
#5 D D A D E
#6 E B A A B
data
set.seed(24)
data2 <- as.data.frame(matrix(sample(LETTERS[1:5], 8*6, replace=TRUE), ncol=8))

change data.frame column into rows in R

A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
I would like to have this:
df
1 2
A 1 6
B 2 7
C 3 8
D 4 9
E 5 0
If your dataframe is truly in that format, then all of your vectors will be character vectors. Or, you basically have a character matrix and you could do this:
data.frame(t(df))
It would be better, though, to just define it the way you want it from the get-go
df <- data.frame(c('A','B','C','D','E'),
c(1, 2, 3, 4, 5),
c(6, 7, 8, 9, 0))
You could also do this
df <- data.frame(LETTERS[1:5], 1:5, c(6:9, 0))
If you wanted to give the columns names, you could do this
df <- data.frame(L = LETTERS[1:5], N1 = 1:5, N2 = c(6:9, 0))
Sometimes, if I use read.DIF of Excel data the data gets transposed. Is that how you got the original data in? If so, you can call
read.DIF(filename, transpose = T)
to get the data in the correct orientation.
I really recommend data.table approach without manual steps becauce they are error-prone
A <- c(1,6)
B <- c(2,7)
C <- c(3,8)
D <- c(4,9)
E <- c(5,0)
df <- data.frame(A,B,C,D,E)
df
library('data.table')
dat.m <- melt(as.data.table(df, keep.rownames = "Vars"), id.vars = "Vars") # https://stackoverflow.com/a/44128640/54964
dat.m
Output
A B C D E
1 1 2 3 4 5
2 6 7 8 9 0
Vars variable value
1: 1 A 1
2: 2 A 6
3: 1 B 2
4: 2 B 7
5: 1 C 3
6: 2 C 8
7: 1 D 4
8: 2 D 9
9: 1 E 5
10: 2 E 0
R: 3.4.0 (backports)
OS: Debian 8.7

Resources